    def test_colleague_model_search_result_dict_with_urls(self):
        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory()
        instances = DBSession.query(Colleague).all()
        self.assertEqual(1, len(instances))
        self.assertEqual(colleague, instances[0])
        
        colleague_url_1 = factory.ColleagueUrlFactory(colleague_id=colleague.colleague_id)
        colleague_url_2 = factory.ColleagueUrlFactory(colleague_id=colleague.colleague_id, url_type="Lab")

        instances = DBSession.query(Colleague).all()
        self.assertEqual(1, len(instances))
        self.assertEqual(colleague, instances[0])
        
        self.assertEqual(colleague.to_search_results_dict(), {
            'format_name': colleague.format_name,
            'first_name': colleague.first_name,
            'last_name': colleague.last_name,
            'organization': colleague.institution,
            'work_phone': colleague.work_phone,
            'fax': colleague.fax,
            'email': colleague.email,
            'webpages': {
                'lab_url': colleague_url_2.obj_url,
                'research_summary_url': colleague_url_1.obj_url
            }
        })
def index_colleagues():
    colleagues = DBSession.query(Colleague).all()

    print "Indexing " + str(len(colleagues)) + " colleagues"

    bulk_data = []
    for c in colleagues:
        description_fields = []
        for field in [c.institution, c.country]:
            if field:
                description_fields.append(field)
        description = ", ".join(description_fields)

        position = "Lab Member"
        if c.is_pi == 1:
            position = "Head of Lab"

        locus = set()
        locus_ids = DBSession.query(ColleagueLocus.locus_id).filter(ColleagueLocus.colleague_id == c.colleague_id).all()
        if len(locus_ids) > 0:
            ids_query = [k[0] for k in locus_ids]
            locus_names = (
                DBSession.query(Locusdbentity.gene_name, Locusdbentity.systematic_name)
                .filter(Locusdbentity.dbentity_id.in_(ids_query))
                .all()
            )
            for l in locus_names:
                if l[0]:
                    locus.add(l[0])
                if l[1]:
                    locus.add(l[1])

        obj = {
            "name": c.last_name + ", " + c.first_name,
            "category": "colleague",
            "href": "/colleague/" + c.format_name + "/overview",
            "description": description,
            "first_name": c.first_name,
            "last_name": c.last_name,
            "institution": c.institution,
            "position": position,
            "country": c.country,
            "state": c.state,
            "colleague_loci": sorted(list(locus)),
        }

        c._include_keywords_to_dict(obj)  # adds 'keywords' to obj

        bulk_data.append({"index": {"_index": INDEX_NAME, "_type": DOC_TYPE, "_id": c.format_name}})

        bulk_data.append(obj)

        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
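The indexers in this listing all build the Elasticsearch bulk request body as alternating action/metadata lines and source documents, flushing every few hundred entries and once more at the end. A minimal, hypothetical sketch of that pairing (the id and field values below are placeholders, not real data):

# Sketch only: shows the action-line / document pairing consumed by es.bulk();
# the id and fields are made up for illustration.
example_bulk = [
    {"index": {"_index": INDEX_NAME, "_type": DOC_TYPE, "_id": "example-id"}},
    {"name": "Example item", "category": "colleague"},
]
# es.bulk(index=INDEX_NAME, body=example_bulk, refresh=True)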
    def test_dbuser_model(self):
        instances = DBSession.query(Dbuser).all()
        self.assertEqual(0, len(instances))

        dbuser = factory.DbuserFactory()
        instances = DBSession.query(Dbuser).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(dbuser, instances[0])
    def test_source_model(self):
        instances = DBSession.query(Source).all()
        self.assertEqual(0, len(instances))

        source = factory.SourceFactory()
        instances = DBSession.query(Source).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(source, instances[0])
def index_phenotypes():
    phenotypes = DBSession.query(Phenotype).all()

    bulk_data = []

    print "Indexing " + str(len(phenotypes)) + " phenotypes"

    for phenotype in phenotypes:
        annotations = DBSession.query(Phenotypeannotation).filter_by(phenotype_id=phenotype.phenotype_id).all()

        references = set([])
        loci = set([])
        chemicals = set([])
        mutant = set([])
        for annotation in annotations:
            references.add(annotation.reference.display_name)
            loci.add(annotation.dbentity.display_name)
            mutant.add(annotation.mutant.display_name)

            annotation_conds = (
                DBSession.query(PhenotypeannotationCond)
                .filter_by(annotation_id=annotation.annotation_id, condition_class="chemical")
                .all()
            )
            for annotation_cond in annotation_conds:
                chemicals.add(annotation_cond.condition_name)

        qualifier = None
        if phenotype.qualifier:
            qualifier = phenotype.qualifier.display_name

        obj = {
            "name": phenotype.display_name,
            "href": phenotype.obj_url,
            "description": phenotype.description,
            "observable": phenotype.observable.display_name,
            "qualifier": qualifier,
            "references": list(references),
            "phenotype_loci": list(loci),
            "number_annotations": len(list(loci)),
            "chemical": list(chemicals),
            "mutant_type": list(mutant),
            "category": "phenotype",
            "keys": [],
        }

        bulk_data.append({"index": {"_index": INDEX_NAME, "_type": DOC_TYPE, "_id": phenotype.format_name}})

        bulk_data.append(obj)

        if len(bulk_data) == 500:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
    def test_obi_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Obi).all()
        self.assertEqual(0, len(instances))

        obi = factory.ObiFactory()
        instances = DBSession.query(Obi).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(obi, instances[0])
    def test_keywords_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Keyword).all()
        self.assertEqual(0, len(instances))

        keyword = factory.KeywordFactory()
        instances = DBSession.query(Keyword).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(keyword, instances[0])
    def test_edam_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Edam).all()
        self.assertEqual(0, len(instances))

        edam = factory.EdamFactory()
        instances = DBSession.query(Edam).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(edam, instances[0])
    def test_taxonomy_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Taxonomy).all()
        self.assertEqual(0, len(instances))

        taxonomy = factory.TaxonomyFactory()
        instances = DBSession.query(Taxonomy).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(taxonomy, instances[0])
    def test_reporter_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Reporter).all()
        self.assertEqual(0, len(instances))

        reporter = factory.ReporterFactory()
        instances = DBSession.query(Reporter).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(reporter, instances[0])
    def test_filepath_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Filepath).all()
        self.assertEqual(0, len(instances))

        filepath = factory.FilepathFactory()
        instances = DBSession.query(Filepath).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(filepath, instances[0])
    def test_locusdbentity_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Locusdbentity).all()
        self.assertEqual(0, len(instances))

        locus = factory.LocusdbentityFactory()
        instances = DBSession.query(Locusdbentity).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(locus, instances[0])
    def test_apo_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Apo).all()
        self.assertEqual(0, len(instances))

        apo = factory.ApoFactory()
        instances = DBSession.query(Apo).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(apo, instances[0])
    def test_allele_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Allele).all()
        self.assertEqual(0, len(instances))

        allele = factory.AlleleFactory()
        instances = DBSession.query(Allele).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(allele, instances[0])
    def test_book_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Book).all()
        self.assertEqual(0, len(instances))

        book = factory.BookFactory()
        instances = DBSession.query(Book).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(book, instances[0])
    def test_journal_model(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Journal).all()
        self.assertEqual(0, len(instances))

        journal = factory.JournalFactory()
        instances = DBSession.query(Journal).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(journal, instances[0])
    def test_chebiurl_model(self):
        source = factory.SourceFactory()
        chebi = factory.ChebiFactory()
        instances = DBSession.query(ChebiUrl).all()
        self.assertEqual(0, len(instances))

        chebiurl = factory.ChebiUrlFactory()
        instances = DBSession.query(ChebiUrl).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(chebiurl, instances[0])
    def test_colleague_model(self):
        instances = DBSession.query(Colleague).all()
        self.assertEqual(0, len(instances))

        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory()
        instances = DBSession.query(Colleague).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(colleague, instances[0])
        self.assertEqual(colleague.source, source)
    def test_phenotype_model(self):
        source = factory.SourceFactory()
        apo = factory.ApoFactory()
        instances = DBSession.query(Phenotype).all()
        self.assertEqual(0, len(instances))

        pheno = factory.PhenotypeFactory()
        instances = DBSession.query(Phenotype).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(pheno, instances[0])
def index_go_terms():
    go_id_blacklist = load_go_id_blacklist("scripts/go_id_blacklist.lst")

    gos = DBSession.query(Go).all()

    print "Indexing " + str(len(gos) - len(go_id_blacklist)) + " GO terms"

    bulk_data = []
    for go in gos:
        if go.goid in go_id_blacklist:
            continue

        synonyms = DBSession.query(GoAlias.display_name).filter_by(go_id=go.go_id).all()

        references = set([])
        go_loci = set([])
        annotations = DBSession.query(Goannotation).filter_by(go_id=go.go_id).all()
        for annotation in annotations:
            if annotation.go_qualifier != "NOT":
                go_loci.add(annotation.dbentity.display_name)
            references.add(annotation.reference.display_name)

        numerical_id = go.goid.split(":")[1]
        key_values = [go.goid, "GO:" + str(int(numerical_id)), numerical_id, str(int(numerical_id))]

        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(k.lower())

        obj = {
            "name": go.display_name,
            "href": go.obj_url,
            "description": go.description,
            "synonyms": [s[0] for s in synonyms],
            "go_id": go.goid,
            "go_loci": sorted(list(go_loci)),
            "number_annotations": len(annotations),
            "references": list(references),
            "category": go.go_namespace.replace(" ", "_"),
            "keys": keys,
        }

        bulk_data.append({"index": {"_index": INDEX_NAME, "_type": DOC_TYPE, "_id": go.goid}})

        bulk_data.append(obj)

        if len(bulk_data) == 800:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
    def test_colleague_keywords_model(self):
        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory()
        keyword = factory.KeywordFactory()
        
        instances = DBSession.query(ColleagueKeyword).all()
        self.assertEqual(0, len(instances))

        colleague_keyword = factory.ColleagueKeywordFactory()
        instances = DBSession.query(ColleagueKeyword).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(colleague_keyword, instances[0])
    def test_colleague_association_model(self):
        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory()
        colleague = factory.ColleagueFactory(colleague_id=113699)
        
        instances = DBSession.query(ColleagueAssociation).all()
        self.assertEqual(0, len(instances))

        association = factory.ColleagueAssociationFactory()
        instances = DBSession.query(ColleagueAssociation).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(association, instances[0])
    def test_filekeyword_model(self):
        source = factory.SourceFactory()
        filedbentity = factory.FiledbentityFactory()
        filepath = factory.FilepathFactory()
        edam = factory.EdamFactory()
        keyword = factory.KeywordFactory()
        
        instances = DBSession.query(FileKeyword).all()
        self.assertEqual(0, len(instances))

        fkeyword = factory.FileKeywordFactory()
        instances = DBSession.query(FileKeyword).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(fkeyword, instances[0])
    def test_reference_document_model(self):
        source = factory.SourceFactory()
        journal = factory.JournalFactory()
        book = factory.BookFactory()
        refdbentity = factory.ReferencedbentityFactory()

        instances = DBSession.query(ReferenceDocument).all()
        self.assertEqual(0, len(instances))

        refdoc = factory.ReferenceDocumentFactory()
        
        instances = DBSession.query(ReferenceDocument).all()

        self.assertEqual(1, len(instances))
        self.assertEqual(refdoc, instances[0])
    def test_colleague_model_search_results_doesnt_send_email_if_required(self):
        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory(display_email=0)
        instances = DBSession.query(Colleague).all()
        self.assertEqual(1, len(instances))
        self.assertEqual(colleague, instances[0])
        self.assertNotIn('email', colleague.to_search_results_dict())
    def test_colleague_model_info_dict_doesnt_send_email_if_required(self):
        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory(display_email=0)
        instances = DBSession.query(Colleague).all()
        colleague_url_1 = factory.ColleagueUrlFactory(colleague_id=colleague.colleague_id)
        colleague_url_2 = factory.ColleagueUrlFactory(colleague_id=colleague.colleague_id, url_type="Lab")
        self.assertEqual(colleague.to_info_dict(), {
            'orcid': colleague.orcid,
            'first_name': colleague.first_name,
            'last_name': colleague.last_name,
            'position': colleague.job_title,
            'profession': colleague.profession,
            'organization': colleague.institution,
            'address': [colleague.address1],
            'city': colleague.city,
            'state': colleague.state,
            'country': colleague.country,
            'postal_code': colleague.postal_code,
            'work_phone': colleague.work_phone,
            'fax': colleague.fax,
            'webpages': {
                'lab_url': colleague_url_2.obj_url,
                'research_summary_url': colleague_url_1.obj_url
            },
            'research_interests': colleague.research_interest,
            'last_update': str(colleague.date_last_modified)
        })
def index_observables():
    observables = DBSession.query(Apo).filter_by(apo_namespace="observable").all()

    print "Indexing " + str(len(observables)) + " observables"
    bulk_data = []

    for observable in observables:
        obj = {
            "name": observable.display_name,
            "href": observable.obj_url,
            "description": observable.description,
            "category": "observable",
            "keys": [],
        }

        bulk_data.append(
            {"index": {"_index": INDEX_NAME, "_type": DOC_TYPE, "_id": "observable_" + str(observable.apo_id)}}
        )

        bulk_data.append(obj)

        if len(bulk_data) == 300:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
    def test_keyword_model_to_dict(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Keyword).all()
        self.assertEqual(0, len(instances))

        keyword = factory.KeywordFactory()

        self.assertEqual(keyword.to_dict(), {'id': keyword.keyword_id, 'name': keyword.display_name})
    def test_edam_model_to_dict(self):
        source = factory.SourceFactory()
        instances = DBSession.query(Edam).all()
        self.assertEqual(0, len(instances))

        edam = factory.EdamFactory()

        self.assertEqual(edam.to_dict(), {'id': edam.edam_id, 'name': edam.format_name})
    def test_colleague_model_should_include_urls_in_dict(self):
        source = factory.SourceFactory()
        colleague = factory.ColleagueFactory()
        instances = DBSession.query(Colleague).all()
        colleague_url_1 = factory.ColleagueUrlFactory(colleague_id=colleague.colleague_id)
        colleague_url_2 = factory.ColleagueUrlFactory(colleague_id=colleague.colleague_id, url_type="Lab")

        colleague_dict = {}
        colleague._include_urls_to_dict(colleague_dict)
        self.assertEqual(colleague_dict, {'webpages': {'lab_url': colleague_url_2.obj_url, 'research_summary_url': colleague_url_1.obj_url}})
Example #31
def index_complex_names():
    complexes = DBSession.query(Complexdbentity).all()
    print(("Indexing " + str(len(complexes)) + " complex names"))

    bulk_data = []

    for c in complexes:

        synonyms = DBSession.query(ComplexAlias.display_name).filter_by(
            complex_id=c.dbentity_id).all()

        references = set([])
        refs = DBSession.query(ComplexReference).filter_by(
            complex_id=c.dbentity_id).all()
        for ref in refs:
            references.add(ref.reference.display_name)

        complex_loci = set([])
        annotations = DBSession.query(Complexbindingannotation).filter_by(
            complex_id=c.dbentity_id).all()
        for a in annotations:
            interactor = a.interactor
            if interactor.locus_id is not None:
                complex_loci.add(interactor.locus.display_name)

        key_values = [c.intact_id, c.complex_accession, c.sgdid]

        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(k.lower())

        obj = {
            "name": c.display_name,
            "complex_name": c.display_name,
            "href": "/complex/" + c.complex_accession,
            "description": c.description + "; " + c.properties,
            "category": "complex",
            "synonyms": [s[0] for s in synonyms],
            "systematic_name": c.systematic_name,
            "intact_id": c.intact_id,
            "complex_accession": c.complex_accession,
            "complex_loci": sorted(list(complex_loci)),
            "references": list(references),
            "keys": list(keys)
        }

        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})

        bulk_data.append(obj)

        if len(bulk_data) == 800:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #32
def index_downloads():
    bulk_data = []
    dbentity_file_obj = IndexESHelper.get_file_dbentity_keyword()
    files = DBSession.query(Filedbentity).filter(
        Filedbentity.is_public == True, Filedbentity.s3_url != None).all()
    print(("indexing " + str(len(files)) + " download files"))
    for x in files:
        try:
            keyword = []
            status = ""
            temp = dbentity_file_obj.get(x.dbentity_id)
            if temp:
                keyword = temp
            if (x.dbentity_status == "Active"
                    or x.dbentity_status == "Archived"):
                if x.dbentity_status == "Active":
                    status = "Active"
                else:
                    status = "Archived"

            obj = {
                "name":
                x.display_name,
                "raw_display_name":
                x.display_name,
                "filename":
                " ".join(x.display_name.split("_")),
                "file_name_format":
                " ".join(x.display_name.split("_")),
                "href":
                x.s3_url if x else None,
                "category":
                "download",
                "description":
                x.description,
                "keyword":
                keyword,
                "format":
                str(x.format.display_name),
                "status":
                str(status),
                "file_size":
                str(IndexESHelper.convertBytes(x.file_size))
                if x.file_size is not None else x.file_size,
                "year":
                str(x.year),
                "readme_url":
                x.readme_file.s3_url if x.readme_file else None,
                "topic":
                x.topic.display_name,
                "data":
                x.data.display_name,
                "path_id":
                x.get_path_id()
            }

            bulk_data.append(
                {"index": {
                    "_index": INDEX_NAME,
                    "_id": str(uuid.uuid4())
                }})

            bulk_data.append(obj)
            if len(bulk_data) == 50:
                es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
                bulk_data = []

        except Exception as e:
            logging.error(str(e))

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #33
def index_go_terms():
    go_id_blacklist = load_go_id_blacklist(
        "scripts/search/go_id_blacklist.lst")

    gos = DBSession.query(Go).all()

    print(("Indexing " + str(len(gos) - len(go_id_blacklist)) + " GO terms"))

    bulk_data = []
    for go in gos:
        if go.goid in go_id_blacklist:
            continue

        synonyms = DBSession.query(
            GoAlias.display_name).filter_by(go_id=go.go_id).all()

        references = set([])
        gene_ontology_loci = set([])
        annotations = DBSession.query(Goannotation).filter_by(
            go_id=go.go_id).all()
        for annotation in annotations:
            if annotation.go_qualifier != "NOT":
                gene_ontology_loci.add(annotation.dbentity.display_name)
            references.add(annotation.reference.display_name)

        numerical_id = go.goid.split(":")[1]
        key_values = [
            go.goid, "GO:" + str(int(numerical_id)), numerical_id,
            str(int(numerical_id))
        ]

        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(k.lower())

        obj = {
            "name": go.display_name,
            "go_name": go.display_name,
            "href": go.obj_url,
            "description": go.description,
            "synonyms": [s[0] for s in synonyms],
            "go_id": go.goid,
            "gene_ontology_loci": sorted(list(gene_ontology_loci)),
            "number_annotations": len(annotations),
            "references": list(references),
            "category": go.go_namespace.replace(" ", "_"),
            "keys": list(keys)
        }

        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})

        bulk_data.append(obj)

        if len(bulk_data) == 800:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #34
def index_references():
    _ref_loci = IndexESHelper.get_dbentity_locus_note()
    _references = DBSession.query(Referencedbentity).all()
    _abstracts = IndexESHelper.get_ref_abstracts()
    _authors = IndexESHelper.get_ref_authors()
    _aliases = IndexESHelper.get_ref_aliases()

    bulk_data = []
    print(("Indexing " + str(len(_references)) + " references"))

    for reference in _references:
        reference_loci = []
        if len(_ref_loci) > 0:
            temp_loci = _ref_loci.get(reference.dbentity_id)
            if temp_loci is not None:
                reference_loci = list(
                    set([
                        x.display_name
                        for x in IndexESHelper.flattern_list(temp_loci)
                    ]))

        abstract = _abstracts.get(reference.dbentity_id)
        if abstract is not None:
            abstract = abstract[0]
        sec_sgdids = _aliases.get(reference.dbentity_id)
        sec_sgdid = None
        authors = _authors.get(reference.dbentity_id)
        if sec_sgdids is not None:
            sec_sgdid = sec_sgdids[0]

        if authors is None:
            authors = []

        journal = reference.journal
        if journal:
            journal = journal.display_name
        key_values = [
            reference.pmcid, reference.pmid, "pmid: " + str(reference.pmid),
            "pmid:" + str(reference.pmid), "pmid " + str(reference.pmid),
            reference.sgdid
        ]

        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(str(k).lower())
        obj = {
            "name": reference.citation,
            "reference_name": reference.citation,
            "href": reference.obj_url,
            "description": abstract,
            "author": authors,
            "journal": journal,
            "year": str(reference.year),
            "reference_loci": reference_loci,
            "secondary_sgdid": sec_sgdid,
            "category": "reference",
            "keys": list(keys)
        }

        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})
        bulk_data.append(obj)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #35
def load_tsv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    # open ssh connection to download server
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    username = input('Username for legacy download server: ')
    password =  getpass.getpass('Password for %s@%s: ' % (username, HOSTNAME))
    client.connect(HOSTNAME, 22, username, password, gss_auth=False, gss_kex=False)
    sftp_client = client.open_sftp()

    f = open(INPUT_FILE_NAME)
    i = 0
    for line in f:
        val = line.split("\t")
        if val[0] == 'bun path':
            continue
        if len(val) > 0:
            i = i + 1
            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ### 
            raw_date = val[13]
            if len(raw_date):
                raw_date = datetime.strptime(val[13], '%Y-%m-%d')
            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'

            bun_path = val[0].strip()
            new_path = val[1].strip()
            if new_path.startswith("datasets/"):
                new_path = "/datasets"
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/', '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/', '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC', 'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT', 'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                'description': val[19].decode('utf-8', 'ignore').replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i, sftp_client)
    client.close()
Example #36
def index_genes():
    # Indexing just the S288C genes
    # dbentity: 1364643 (id) -> straindbentity -> 274901 (taxonomy_id)
    # list of dbentities comes from table DNASequenceAnnotation with taxonomy_id 274901
    # feature_type comes from DNASequenceAnnotation as well
    gene_ids_so = DBSession.query(
        Dnasequenceannotation.dbentity_id, Dnasequenceannotation.so_id).filter(
            Dnasequenceannotation.taxonomy_id == 274901).all()
    dbentity_ids_to_so = {}
    dbentity_ids = set([])
    so_ids = set([])
    for gis in gene_ids_so:
        dbentity_ids.add(gis[0])
        so_ids.add(gis[1])
        dbentity_ids_to_so[gis[0]] = gis[1]
    # add some non S288C genes
    not_s288c = DBSession.query(Locusdbentity.dbentity_id).filter(
        Locusdbentity.not_in_s288c == True).all()
    for id in not_s288c:
        dbentity_ids.add(id[0])
        # assume non S288C features to be ORFs
        dbentity_ids_to_so[id[0]] = 263757
    all_genes = DBSession.query(Locusdbentity).filter(
        Locusdbentity.dbentity_id.in_(list(dbentity_ids))).all()

    # make list of merged/deleted genes so they don't redirect when they show up as an alias
    merged_deleted_r = DBSession.query(Locusdbentity.format_name).filter(
        Locusdbentity.dbentity_status.in_(["Merged", "Deleted"])).all()
    merged_deleted = [d[0] for d in merged_deleted_r]

    feature_types_db = DBSession.query(So.so_id, So.display_name).filter(
        So.so_id.in_(list(so_ids))).all()
    feature_types = {}
    for ft in feature_types_db:
        feature_types[ft[0]] = ft[1]

    tc_numbers_db = DBSession.query(LocusAlias).filter_by(
        alias_type="TC number").all()
    tc_numbers = {}
    for tc in tc_numbers_db:
        if tc.locus_id in tc_numbers:
            tc_numbers[tc.locus_id].append(tc.display_name)
        else:
            tc_numbers[tc.locus_id] = [tc.display_name]

    ec_numbers_db = DBSession.query(LocusAlias).filter_by(
        alias_type="EC number").all()
    ec_numbers = {}
    for ec in ec_numbers_db:
        if ec.locus_id in ec_numbers:
            ec_numbers[ec.locus_id].append(ec.display_name)
        else:
            ec_numbers[ec.locus_id] = [ec.display_name]

    secondary_db = DBSession.query(LocusAlias).filter_by(
        alias_type="SGDID Secondary").all()
    secondary_sgdids = {}

    for sid in secondary_db:
        if sid.locus_id in secondary_sgdids:
            secondary_sgdids[sid.locus_id].append(sid.display_name)
        else:
            secondary_sgdids[sid.locus_id] = [sid.display_name]

    bulk_data = []

    print(("Indexing " + str(len(all_genes)) + " genes"))
    ##### test newer methods ##########
    _summary = IndexESHelper.get_locus_dbentity_summary()
    _protein = IndexESHelper.get_locus_dbentity_alias(["NCBI protein name"])
    _phenos = IndexESHelper.get_locus_phenotypeannotation()
    _goids = IndexESHelper.get_locus_go_annotation()
    _aliases_raw = IndexESHelper.get_locus_dbentity_alias(
        ["Uniform", "Non-uniform", "Retired name", "UniProtKB ID"])

    ###################################
    not_mapped_genes = IndexESHelper.get_not_mapped_genes()
    is_quick_flag = True

    for gene in all_genes:
        # _systematic_name is used in the indexed document below regardless of branch
        _systematic_name = gene.systematic_name
        if gene.gene_name:
            _name = gene.gene_name
            if gene.systematic_name and gene.gene_name != gene.systematic_name:
                _name += " / " + gene.systematic_name
        else:
            _name = gene.systematic_name

        #summary = DBSession.query(Locussummary.text).filter_by(locus_id=gene.dbentity_id).all()
        summary = []
        if (_summary is not None):
            summary = _summary.get(gene.dbentity_id)
        #protein = DBSession.query(LocusAlias.display_name).filter_by(locus_id=gene.dbentity_id, alias_type="NCBI protein name").one_or_none()
        protein = _protein.get(gene.dbentity_id)
        if protein is not None:
            protein = protein[0].display_name

        # TEMP don"t index due to schema schange
        # sequence_history = DBSession.query(Locusnoteannotation.note).filter_by(dbentity_id=gene.dbentity_id, note_type="Sequence").all()
        # gene_history = DBSession.query(Locusnoteannotation.note).filter_by(dbentity_id=gene.dbentity_id, note_type="Locus").all()

        #phenotype_ids = DBSession.query(Phenotypeannotation.phenotype_id).filter_by(dbentity_id=gene.dbentity_id).all()
        phenotype_ids = []
        if _phenos is not None:
            temp = _phenos.get(gene.dbentity_id)
            if temp is not None:
                phenotype_ids = [x.phenotype_id for x in temp]
        if len(phenotype_ids) > 0:
            phenotypes = DBSession.query(Phenotype.display_name).filter(
                Phenotype.phenotype_id.in_(phenotype_ids)).all()
        else:
            phenotypes = []
        #go_ids = DBSession.query(Goannotation.go_id).filter(and_(Goannotation.go_qualifier != "NOT", Goannotation.dbentity_id == gene.dbentity_id)).all()
        go_ids = _goids.get(gene.dbentity_id)
        if go_ids is not None:
            go_ids = [x.go_id for x in go_ids]
        else:
            go_ids = []
        go_annotations = {
            "cellular component": set([]),
            "molecular function": set([]),
            "biological process": set([])
        }
        if len(go_ids) > 0:
            #go_ids = [g[0] for g in go_ids]
            go = DBSession.query(Go.display_name, Go.go_namespace).filter(
                Go.go_id.in_(go_ids)).all()
            for g in go:
                go_annotations[g[1]].add(g[0] + " (direct)")
        go_slim_ids = DBSession.query(Goslimannotation.goslim_id).filter(
            Goslimannotation.dbentity_id == gene.dbentity_id).all()
        if len(go_slim_ids) > 0:
            go_slim_ids = [g[0] for g in go_slim_ids]
            go_slim = DBSession.query(
                Goslim.go_id, Goslim.display_name).filter(
                    Goslim.goslim_id.in_(go_slim_ids)).all()
            go_ids = [g[0] for g in go_slim]
            go = DBSession.query(Go.go_id, Go.go_namespace).filter(
                Go.go_id.in_(go_ids)).all()
            for g in go:
                for gs in go_slim:
                    if (gs[0] == g[0]):
                        go_annotations[g[1]].add(gs[1])

        # add "quick direct" keys such as aliases, SGD, UniProt ID and format aliases
        #aliases_raw = DBSession.query(LocusAlias.display_name, LocusAlias.alias_type).filter(and_(LocusAlias.locus_id==gene.dbentity_id, LocusAlias.alias_type.in_())).all()
        aliases_raw = _aliases_raw.get(gene.dbentity_id)
        alias_quick_direct_keys = []
        aliases = []
        if aliases_raw is not None:
            for alias_item in aliases_raw:
                name = alias_item.display_name
                if name not in merged_deleted:
                    alias_quick_direct_keys.append(name)
                if alias_item.alias_type != "UniProtKB ID":
                    aliases.append(name)
        '''for d in aliases_raw:
            name = d[0]
            if name not in merged_deleted:
                alias_quick_direct_keys.append(name)
            if d[1] != "UniProtKB ID":
                aliases.append(name)'''
        # make everything in keys lowercase to ignore case
        keys = []
        _keys = [gene.gene_name, gene.systematic_name, gene.sgdid
                 ] + alias_quick_direct_keys
        # Add SGD:<gene SGDID> to list of keywords for quick search
        _keys.append("SGD:{}".format(gene.sgdid))
        # If this gene has a reservedname associated with it, add that reservedname to
        # the list of keywords used for the quick search of this gene
        reservedname = DBSession.query(Reservedname).filter_by(
            locus_id=gene.dbentity_id).one_or_none()
        if reservedname:
            _keys.append(reservedname.display_name)
        for k in _keys:
            if k:
                keys.append(k.lower())

        obj = {
            "name": _name,
            "locus_name": _name,
            "sys_name": _systematic_name,
            "href": gene.obj_url,
            "description": gene.description,
            "category": "locus",
            "feature_type": feature_types[dbentity_ids_to_so[gene.dbentity_id]],
            "name_description": gene.name_description,
            "summary": summary,
            "locus_summary": summary,
            "phenotypes": [p[0] for p in phenotypes],
            "aliases": aliases,
            "cellular_component": list(go_annotations["cellular component"] - set([
                "cellular component", "cellular component (direct)",
                "cellular_component", "cellular_component (direct)"
            ])),
            "biological_process": list(go_annotations["biological process"] - set([
                "biological process (direct)", "biological process",
                "biological_process (direct)", "biological_process"
            ])),
            "molecular_function": list(go_annotations["molecular function"] - set([
                "molecular function (direct)", "molecular function",
                "molecular_function (direct)", "molecular_function"
            ])),
            "ec_number": ec_numbers.get(gene.dbentity_id),
            "protein": protein,
            "tc_number": tc_numbers.get(gene.dbentity_id),
            "secondary_sgdid": secondary_sgdids.get(gene.dbentity_id),
            "status": gene.dbentity_status,
            # TEMP don't index due to schema change
            # "sequence_history": [s[0] for s in sequence_history],
            # "gene_history": [g[0] for g in gene_history],
            "bioentity_id": gene.dbentity_id,
            "keys": list(keys),
            "is_quick_flag": str(is_quick_flag)
        }

        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})

        bulk_data.append(obj)

        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
def update_database_load_file_to_s3(nex_session, data_file, gzip_file,
                                    source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    gff_md5sum = hashlib.md5(gzip_file.encode()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')  # data:3495    RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')  # topic:0099   RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464  JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()

    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return

    file_id = rnaFile.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info(
            "The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + data_file)
Example #38
def update_database_load_file_to_s3(nex_session, gaf_file, is_public, source_to_id, edam_to_id, datestamp):

    # gene_association.sgd.20171204.gaf.gz
    # gene_association.sgd-yeastmine.20171204.gaf.gz
 
    # datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
    gzip_file = gaf_file + "." + datestamp + ".gaf.gz"
    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file, mode='rb')

    ### upload a current GAF file to S3 with a static URL for Go Community ###
    if is_public == '1':
        upload_gaf_to_s3(local_file, "latest/gene_association.sgd.gaf.gz")
    ##########################################################################

    import hashlib
    gaf_md5sum = hashlib.md5(gaf_file.encode()).hexdigest()

    row = nex_session.query(Filedbentity).filter_by(md5sum = gaf_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})

    if is_public == '1':
        nex_session.query(Dbentity).filter(Dbentity.display_name.like('gene_association.sgd%')).filter(Dbentity.dbentity_status=='Active').update({"dbentity_status":'Archived'}, synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')   ## data:2048 Report
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3475') ## format:3475 TSV

    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)
    
    readme = nex_session.query(Dbentity).filter_by(display_name="gene_association.README", dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("gene_association.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id
 
    # path.path = /reports/function

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All GO annotations for yeast genes (protein and RNA) in GAF file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public=is_public,
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gaf_md5sum)

    gaf = nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gaf is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gaf.dbentity_id

    path = nex_session.query(Path).filter_by(path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id = file_id,
                 path_id = path_id,
                 source_id = source_to_id['SGD'],
                 created_by = CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + gaf_file)
def load_csv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    # open ssh connection to download server
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    username = raw_input('Username for legacy download server: ')
    password = getpass.getpass('Password for %s@%s: ' % (username, HOSTNAME))
    client.connect(HOSTNAME,
                   22,
                   username,
                   password,
                   gss_auth=False,
                   gss_kex=False)
    sftp_client = client.open_sftp()

    o = open(INPUT_FILE_NAME, 'rU')
    reader = csv.reader(o)
    for i, val in enumerate(reader):
        if i > 0:
            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return

            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                temp = format_csv_date_string(val[13])
                if temp is not None:
                    raw_date = datetime.strptime(temp, '%Y-%m-%d')
                else:
                    raw_date = datetime.strptime(val[13], '%Y-%m-%d')

            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'

            bun_path = val[0].strip()
            new_path = val[1].strip()
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC',
                                                        'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT',
                                                          'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': val[18],
                'description': val[19].decode('utf-8', 'ignore'),
                'pmids': val[20],
                'keywords': val[21]
            }
            create_and_upload_file(obj, i, sftp_client)
    client.close()
Example #40
 def query_objects(self, context):
     service = self.parent.selected_object(context)
     return DBSession.query(Payment).filter(
         Payment.taxation_service_id == service.id).all()
Example #41
 def query_objects(self, context):
     club = self.parent.selected_object(context)
     return DBSession.query(Dog).filter(
         Dog.dog_training_club_id == club.id).all()
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(md5sum = file_md5sum).one_or_none()

    if row is not None:
        return

    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(Dbentity.dbentity_status=='Active').update({"dbentity_status":'Archived'}, synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(Dbentity.dbentity_status=='Active').update({"dbentity_status":'Archived'}, synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(Dbentity.dbentity_status=='Active').update({"dbentity_status":'Archived'}, synchronize_session='fetch')
    
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')   ## data:3671 Text 
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3507') ## format:3507 Document format 

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)
    
    # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
    # if readme is None:
    #    log.info("ncbi_tbl_files.README is not in the database.")
    #    return
    # readme_file_id = readme.dbentity_id

    readme_file_id = None

    # path.path = /reports/function

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All yeast features in tbl file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=file_md5sum)

    file = nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').one_or_none()
    if file is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = file.dbentity_id

    path = nex_session.query(Path).filter_by(path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id = file_id,
                 path_id = path_id,
                 source_id = source_to_id['SGD'],
                 created_by = CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
 def get_readme_file(cls, id):
     _data = DBSession.query(Filedbentity).filter(Filedbentity.dbentity_id == id).all()
Example #44
 def query_objects(self, context):
     return DBSession.query(TaxationService).all()
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file, source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')

    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    gff_md5sum = hashlib.md5(gzip_file.encode()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    nex_session.query(Dbentity).filter(Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(
        Dbentity.dbentity_status == 'Active').update({"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')  # data:3671    Text
    # topic:3068   Literature and language
    topic_id = edam_to_id.get('EDAM:3068')
    format_id = edam_to_id.get('EDAM:3507')  # format:3507  Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="saccharomyces_cerevisiae_gff.README", dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()

    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + gff_file)
Example #46
def load_csv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, 'rU')
    reader = csv.reader(o)
    for i, val in enumerate(reader):
        if i > 0:

            ### added by Shuai
            if len(val) == 0:
                continue

            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return

            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                temp = format_csv_date_string(val[13])
                if temp is not None:
                    raw_date = datetime.strptime(temp, '%Y-%m-%d')
                else:
                    raw_date = datetime.strptime(val[13], '%Y-%m-%d')

            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'

            bun_path = val[0].strip()
            new_path = val[1].strip()
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC',
                                                        'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT',
                                                          'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                'description': val[19].decode('utf-8',
                                              'ignore').replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i)
Example #47
from sqlalchemy import create_engine, and_
from elasticsearch import Elasticsearch
# from mapping import mapping
from es7_mapping import mapping
import os
import requests
from threading import Thread
import json
import collections
from index_es_helpers import IndexESHelper
import concurrent.futures
import uuid
import logging

# DBSession and Base (configured below) come from the project's models module;
# this import is assumed here because the original snippet omits it.
from src.models import DBSession, Base

engine = create_engine(os.environ["NEX2_URI"], pool_recycle=3600)
DBSession.configure(bind=engine)
Base.metadata.bind = engine

INDEX_NAME = os.environ.get("ES_INDEX_NAME", "searchable_items_aws")
DOC_TYPE = "searchable_item"
ES_URI = os.environ["WRITE_ES_URI"]
es = Elasticsearch(ES_URI, retry_on_timeout=True)


def delete_mapping():
    print("Deleting mapping...")
    response = requests.delete(ES_URI + INDEX_NAME + "/")
    if response.status_code != 200:
        print(("ERROR: " + str(response.json())))
    else:
        print("SUCCESS")
Example #48
def insert_author_response(request):

    try:
        sgd = DBSession.query(Source).filter_by(display_name='Direct submission').one_or_none()
        source_id = sgd.source_id
        created_by = 'OTTO'

        email = request.params.get('email')
        if email == '':
            return HTTPBadRequest(body=json.dumps({'error': "Please enter your email address."}), content_type='text/json')
        is_email_valid = validate_email(email, verify=False)
        if not is_email_valid:
            msg = email + ' is not a valid email.'
            return HTTPBadRequest(body=json.dumps({'error': msg}), content_type='text/json')

        pmid = request.params.get('pmid')
        pmid = pmid.replace('PMID:', '').replace('Pubmed ID:', '').strip()
        if pmid == '':
            return HTTPBadRequest(body=json.dumps({'error': "Please enter Pubmed ID for your paper."}), content_type='text/json')
        if pmid.isdigit():
            pmid = int(pmid)
        else:
            return HTTPBadRequest(body=json.dumps({'error': "Please enter a number for Pubmed ID."}), content_type='text/json')

        x = DBSession.query(Authorresponse).filter_by(author_email=email, pmid=int(pmid)).one_or_none()
        if x is not None:
            return HTTPBadRequest(body=json.dumps({'error': "You have already subomitted info for PMID:" + str(pmid)+"."}), content_type='text/json')

        has_novel_research = '0'
        if request.params.get('has_novel_research'):
            has_novel_research = '1'
        has_large_scale_data = '0'
        if request.params.get('has_large_scale_data'):
            has_large_scale_data = '1'

        research_results = request.params.get('research_result')
        dataset_description = request.params.get('dataset_desc')
        gene_list = request.params.get('genes')
        other_description = request.params.get('other_desc')

        x = Authorresponse(source_id = source_id,
                           pmid = pmid,
                           author_email = email,
                           has_novel_research = has_novel_research,
                           has_large_scale_data = has_large_scale_data,
                           has_fast_track_tag = '0',
                           curator_checked_datasets = '0',
                           curator_checked_genelist = '0',
                           no_action_required = '0',
                           research_results = research_results,
                           gene_list = gene_list,
                           dataset_description = dataset_description,
                           other_description = other_description,
                           created_by = created_by)

        DBSession.add(x)
        transaction.commit()
        return {'curation_id': 0}
    except Exception as e:
        transaction.abort()
        return HTTPBadRequest(body=json.dumps({'error': "ERROR: " + str(e)}), content_type='text/json')
Example #49
 def set_invaild_token(cls, token):
     row = DBSession().query(cls).filter(cls.token == token).first()
     if row:
         row.status = 0
         row.save()