Exemple #1
0
    def strain_paragraph_starter():
        """Yield one paragraph record for every entry of ``strain_paragraphs``.

        For each ``(strain_key, paragraph)`` pair, a dict with the keys
        ``source``, ``text``, ``html`` and ``strain`` is yielded when
        ``strain_key`` matches the ``unique_key()`` of a ``Strain`` row.
        Otherwise a notice is printed and ``None`` is yielded, so exactly one
        item is produced per input entry.

        The session is closed in a ``finally`` clause so it is released even
        if the consumer stops iterating early or a helper raises; previously
        it was only closed when the generator ran to completion.
        """
        nex_session = nex_session_maker()
        try:
            key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
            key_to_strain = {x.unique_key(): x for x in nex_session.query(Strain).all()}

            for strain_key, paragraph in strain_paragraphs.iteritems():
                if strain_key not in key_to_strain:
                    # A single parenthesised argument prints identically under
                    # the Python 2 print statement.
                    print("Strain not found: " + str(strain_key))
                    yield None
                    continue

                strain = key_to_strain[strain_key]
                # Only the first element of the paragraph entry is used as text.
                text = paragraph[0]
                # Link gene names first, then strain names on the linked html.
                html = link_gene_names(text, {"HO"}, nex_session)
                html = link_strain_names(html, {strain.display_name}, nex_session)
                yield {
                    "source": key_to_source["SGD"],
                    "text": text,
                    "html": html,
                    "strain": strain,
                }
        finally:
            nex_session.close()
Exemple #2
0
    def reference_paragraph_starter():
        """Yield one paragraph record per ``Abstract`` row of the bud session.

        Abstracts are read through
        ``make_db_starter(bud_session.query(Abstract), 1000)()`` (the ``1000``
        is presumably a batch size -- confirm against ``make_db_starter``).
        For an abstract whose ``reference_id`` matches the ``id`` of a
        ``Reference`` row, a dict with the keys ``source``, ``text``, ``html``
        and ``reference`` is yielded; otherwise a notice is printed and
        ``None`` is yielded.

        Both sessions are closed in a ``finally`` clause so they are released
        even if the consumer stops iterating early or a lookup raises;
        previously they were only closed when the generator ran to completion.
        """
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()
        try:
            key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
            id_to_reference = {x.id: x for x in nex_session.query(Reference).all()}

            for old_abstract in make_db_starter(bud_session.query(Abstract), 1000)():
                reference_id = old_abstract.reference_id
                if reference_id not in id_to_reference:
                    # A single parenthesised argument prints identically under
                    # the Python 2 print statement.
                    print('Reference not found: ' + str(reference_id))
                    yield None
                    continue

                yield {
                    'source': key_to_source['SGD'],
                    'text': old_abstract.text,
                    # No gene names are passed in the second argument here.
                    'html': link_gene_names(old_abstract.text, set(), nex_session),
                    'reference': id_to_reference[reference_id],
                }
        finally:
            # Close in the original order; the inner finally makes sure a
            # failure while closing the first session does not skip the second.
            try:
                bud_session.close()
            finally:
                nex_session.close()
Exemple #3
0
    def strain_paragraph_starter():
        """Yield one paragraph record for every entry of ``strain_paragraphs``.

        For each ``(strain_key, paragraph)`` pair, a dict with the keys
        ``source``, ``text``, ``html`` and ``strain`` is yielded when
        ``strain_key`` matches the ``unique_key()`` of a ``Strain`` row.
        Otherwise a notice is printed and ``None`` is yielded, so exactly one
        item is produced per input entry.

        The session is closed in a ``finally`` clause so it is released even
        if the consumer stops iterating early or a helper raises; previously
        it was only closed when the generator ran to completion.
        """
        nex_session = nex_session_maker()
        try:
            key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
            key_to_strain = {x.unique_key(): x for x in nex_session.query(Strain).all()}

            for strain_key, paragraph in strain_paragraphs.iteritems():
                if strain_key not in key_to_strain:
                    # A single parenthesised argument prints identically under
                    # the Python 2 print statement.
                    print('Strain not found: ' + str(strain_key))
                    yield None
                    continue

                strain = key_to_strain[strain_key]
                # Only the first element of the paragraph entry is used as text.
                text = paragraph[0]
                # Link gene names first, then strain names on the linked html.
                html = link_gene_names(text, {'HO'}, nex_session)
                html = link_strain_names(html, {strain.display_name}, nex_session)
                yield {
                    'source': key_to_source['SGD'],
                    'text': text,
                    'html': html,
                    'strain': strain,
                }
        finally:
            nex_session.close()
Exemple #4
0
    def bioentity_paragraph_starter():
        """Yield paragraph records for loci, one category after another.

        Records are dicts carrying ``bioentity``, ``source``, ``text``,
        ``html`` and ``category`` (plus date/creator fields for some
        categories). Categories are produced in this order:

        * ``LSP``        -- paragraphs attached to ``Feature`` rows of the bud
                            session, joined and passed through
                            ``clean_paragraph``.
        * ``GODATE``     -- ``date_last_reviewed`` of the first manually
                            curated ``GoFeature`` seen for each feature name.
        * ``GO``         -- ``go_annotation_summary`` values read from the
                            gp_information file.
        * ``REGULATION`` -- summaries read from the regulation summary files.
        * ``PHENOTYPE``  -- summaries read from the phenotype summary files.

        ``None`` is yielded for GODATE/GO/REGULATION/PHENOTYPE inputs whose
        locus cannot be resolved.

        Both sessions are closed in a ``finally`` clause so they are released
        even if the consumer stops iterating early or a step raises.
        """
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()
        try:
            key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
            # Query the loci once and index them both ways.
            loci = nex_session.query(Locus).all()
            key_to_bioentity = {x.unique_key(): x for x in loci}
            id_to_bioentity = {x.id: x for x in loci}
            sgdid_to_reference = {x.sgdid: x for x in nex_session.query(Reference).all()}
            sgdid_to_bioentity = {x.sgdid: x for x in nex_session.query(Bioentity).all()}
            # go_id is keyed by the integer that follows its first three characters.
            goid_to_go = {int(x.go_id[3:]): x for x in nex_session.query(Go).all()}

            # LSP
            for feature in bud_session.query(Feature).all():
                paragraph_feats = feature.paragraph_feats
                if not paragraph_feats or feature.id not in id_to_bioentity:
                    continue

                bioentity = id_to_bioentity[feature.id]
                paragraph_feats.sort(key=lambda x: x.order)
                paragraph_html, paragraph_text = clean_paragraph(
                    bioentity,
                    '<p>' + '</p><p>'.join([x.paragraph.text for x in paragraph_feats]) + '</p>',
                    str([x.paragraph.id for x in paragraph_feats]),
                    sgdid_to_reference,
                    sgdid_to_bioentity,
                    goid_to_go)

                # Keep the most recent edit date, compared at day granularity;
                # on a tie the paragraph that comes first in sort order wins.
                date_edited = None
                latest_day = None
                for paragraph_feat in paragraph_feats:
                    my_date = paragraph_feat.paragraph.date_edited
                    # The date part is everything before the first space;
                    # int() accepts zero-padded fields such as '03'.
                    fields = str(my_date).split(' ')[0].split('-')
                    this_day = (int(fields[0]), int(fields[1]), int(fields[2]))
                    if latest_day is None or this_day > latest_day:
                        date_edited = my_date
                        latest_day = this_day

                yield {
                    'bioentity': bioentity,
                    'source': key_to_source['SGD'],
                    'text': paragraph_text,
                    'html': paragraph_html,
                    'date_edited': date_edited,
                    'date_created': paragraph_feats[0].paragraph.date_created,
                    'created_by': paragraph_feats[0].paragraph.created_by,
                    'category': 'LSP'
                }

            # Go
            bioentity_key_to_date = dict()
            for gofeature in bud_session.query(GoFeature).all():
                bioentity_key = (gofeature.feature.name, 'LOCUS')
                # Only the first manually curated annotation per locus is recorded.
                if gofeature.annotation_type == 'manually curated' and bioentity_key not in bioentity_key_to_date:
                    bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

            for bioentity_key, date_last_reviewed in bioentity_key_to_date.iteritems():
                if bioentity_key in key_to_bioentity:
                    yield {
                        'bioentity': key_to_bioentity[bioentity_key],
                        'source': key_to_source['SGD'],
                        'text': str(date_last_reviewed),
                        'html': str(date_last_reviewed),
                        'date_created': None,
                        'created_by': None,
                        'category': 'GODATE'
                    }
                else:
                    # Unknown loci are reported to the consumer as None, without a message.
                    yield None

            for pieces in make_file_starter('src/sgd/convert/data/gp_information.559292_sgd')():
                # Fields 8 and 9 are both read below, so at least ten fields
                # are required (the former ``>= 8`` guard let short rows
                # through to an IndexError).
                if len(pieces) < 10:
                    continue
                sgdid = pieces[8]
                if not sgdid.startswith('SGD:'):
                    continue
                sgdid = sgdid[4:]
                # Drop the first 22 characters of each matching entry and trim whitespace.
                go_annotation = [x[22:].strip() for x in pieces[9].split('|') if x.startswith('go_annotation_summary')]
                if len(go_annotation) != 1:
                    continue
                if sgdid in sgdid_to_bioentity:
                    yield {
                        'bioentity': sgdid_to_bioentity[sgdid],
                        'source': key_to_source['SGD'],
                        'text': go_annotation[0],
                        'html': go_annotation[0],
                        'date_created': None,
                        'created_by': None,
                        'category': 'GO'
                    }
                else:
                    # A single parenthesised argument prints identically under
                    # the Python 2 print statement.
                    print('Bioentity not found: ' + sgdid)
                    yield None

            # Regulation
            file_names = ['src/sgd/convert/data/regulationSummaries',
                          'src/sgd/convert/data/15-8regulationSummaries.txt',
                          'src/sgd/convert/data/15-9regulationSummaries.txt',
                          'src/sgd/convert/data/15-10regulationSummaries.txt',
                          'src/sgd/convert/data/15-11regulationSummaries.txt',
                          'src/sgd/convert/data/16-1regulationSummaries.txt',
                          'src/sgd/convert/data/16-2regulationSummaries.txt',
                          'src/sgd/convert/data/16-3regulationSummaries.txt',
                          'src/sgd/convert/data/16-4regulationSummaries.txt',
                          'src/sgd/convert/data/16-5regulationSummaries.txt']

            for file_name in file_names:
                for row in make_file_starter(file_name)():
                    bioentity_key = (row[0], 'LOCUS')
                    if bioentity_key not in key_to_bioentity:
                        # Unknown loci are reported to the consumer as None, without a message.
                        yield None
                        continue

                    bioentity = key_to_bioentity[bioentity_key]
                    # The locus's own names, with and without a trailing 'P',
                    # are handed to link_gene_names as its second argument.
                    own_names = {bioentity.display_name, bioentity.format_name,
                                 bioentity.display_name + 'P', bioentity.format_name + 'P'}
                    # The regulation summary text is read from the third column.
                    yield {
                        'bioentity': bioentity,
                        'source': key_to_source['SGD'],
                        'text': row[2],
                        'html': link_gene_names(row[2], own_names, nex_session),
                        'category': 'REGULATION'
                    }

            # Phenotype
            file_names = ['src/sgd/convert/data/PhenotypeSummaries032015.txt',
                          'src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt',
                          'src/sgd/convert/data/15-7phenoSummaries.txt',
                          'src/sgd/convert/data/15-8phenoSummaries.txt',
                          'src/sgd/convert/data/15-9phenoSummaries.txt',
                          'src/sgd/convert/data/15-10phenoSummaries.txt',
                          'src/sgd/convert/data/15-11phenoSummaries.txt',
                          'src/sgd/convert/data/15-12phenoSummaries.txt',
                          'src/sgd/convert/data/16-1phenoSummaries.txt',
                          'src/sgd/convert/data/16-2phenoSummaries.txt',
                          'src/sgd/convert/data/16-3phenoSummaries.txt',
                          'src/sgd/convert/data/16-4phenoSummaries.txt',
                          'src/sgd/convert/data/16-5phenoSummaries.txt',
                          'src/sgd/convert/data/16-6phenoSummaries.txt',
                          'src/sgd/convert/data/16-7phenoSummaries.txt',
                          'src/sgd/convert/data/16-9phenoSummaries.txt',
                          'src/sgd/convert/data/16-10phenoSummaries.txt']

            for file_name in file_names:
                for row in make_file_starter(file_name)():
                    bioentity_key = (row[0], 'LOCUS')
                    if bioentity_key not in key_to_bioentity:
                        # Unknown loci are reported to the consumer as None, without a message.
                        yield None
                        continue

                    bioentity = key_to_bioentity[bioentity_key]
                    own_names = {bioentity.display_name, bioentity.format_name,
                                 bioentity.display_name + 'P', bioentity.format_name + 'P'}
                    # The phenotype summary text is read from the second column.
                    yield {
                        'bioentity': bioentity,
                        'source': key_to_source['SGD'],
                        'text': row[1],
                        'html': link_gene_names(row[1], own_names, nex_session),
                        'category': 'PHENOTYPE'
                    }
        finally:
            # Close in the original order; the inner finally makes sure a
            # failure while closing the first session does not skip the second.
            try:
                bud_session.close()
            finally:
                nex_session.close()
Exemple #5
0
    def bioentity_paragraph_starter():
        """Yield paragraph records for loci, one category after another.

        Records are dicts carrying ``bioentity``, ``source``, ``text``,
        ``html`` and ``category`` (plus date/creator fields for some
        categories). Categories are produced in this order:

        * ``LSP``        -- paragraphs attached to ``Feature`` rows of the bud
                            session, joined and passed through
                            ``clean_paragraph``; dates and creator are taken
                            from the first paragraph after sorting by ``order``.
        * ``GODATE``     -- ``date_last_reviewed`` of the first manually
                            curated ``GoFeature`` seen for each feature name.
        * ``GO``         -- ``go_annotation_summary`` values read from the
                            gp_information file.
        * ``REGULATION`` -- summaries read from the regulation summary files.
        * ``PHENOTYPE``  -- summaries read from the phenotype summary files.

        ``None`` is yielded for GODATE/GO/REGULATION/PHENOTYPE inputs whose
        locus cannot be resolved.

        Both sessions are closed in a ``finally`` clause so they are released
        even if the consumer stops iterating early or a step raises.
        """
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()
        try:
            key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
            # Query the loci once and index them both ways.
            loci = nex_session.query(Locus).all()
            key_to_bioentity = {x.unique_key(): x for x in loci}
            id_to_bioentity = {x.id: x for x in loci}
            sgdid_to_reference = {x.sgdid: x for x in nex_session.query(Reference).all()}
            sgdid_to_bioentity = {x.sgdid: x for x in nex_session.query(Bioentity).all()}
            # go_id is keyed by the integer that follows its first three characters.
            goid_to_go = {int(x.go_id[3:]): x for x in nex_session.query(Go).all()}

            # LSP
            for feature in bud_session.query(Feature).all():
                paragraph_feats = feature.paragraph_feats
                if not paragraph_feats or feature.id not in id_to_bioentity:
                    continue

                bioentity = id_to_bioentity[feature.id]
                paragraph_feats.sort(key=lambda x: x.order)
                first_paragraph = paragraph_feats[0].paragraph
                paragraph_html, paragraph_text = clean_paragraph(
                    bioentity,
                    "<p>" + "</p><p>".join([x.paragraph.text for x in paragraph_feats]) + "</p>",
                    str([x.paragraph.id for x in paragraph_feats]),
                    sgdid_to_reference,
                    sgdid_to_bioentity,
                    goid_to_go,
                )
                yield {
                    "bioentity": bioentity,
                    "source": key_to_source["SGD"],
                    "text": paragraph_text,
                    "html": paragraph_html,
                    "date_edited": first_paragraph.date_edited,
                    "date_created": first_paragraph.date_created,
                    "created_by": first_paragraph.created_by,
                    "category": "LSP",
                }

            # Go
            bioentity_key_to_date = dict()
            for gofeature in bud_session.query(GoFeature).all():
                bioentity_key = (gofeature.feature.name, "LOCUS")
                # Only the first manually curated annotation per locus is recorded.
                if gofeature.annotation_type == "manually curated" and bioentity_key not in bioentity_key_to_date:
                    bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

            for bioentity_key, date_last_reviewed in bioentity_key_to_date.iteritems():
                if bioentity_key in key_to_bioentity:
                    yield {
                        "bioentity": key_to_bioentity[bioentity_key],
                        "source": key_to_source["SGD"],
                        "text": str(date_last_reviewed),
                        "html": str(date_last_reviewed),
                        "date_created": None,
                        "created_by": None,
                        "category": "GODATE",
                    }
                else:
                    # Unknown loci are reported to the consumer as None, without a message.
                    yield None

            for pieces in make_file_starter("src/sgd/convert/data/gp_information.559292_sgd")():
                # Fields 8 and 9 are both read below, so at least ten fields
                # are required (the former ``>= 8`` guard let short rows
                # through to an IndexError).
                if len(pieces) < 10:
                    continue
                sgdid = pieces[8]
                if not sgdid.startswith("SGD:"):
                    continue
                sgdid = sgdid[4:]
                # Drop the first 22 characters of each matching entry and trim whitespace.
                go_annotation = [
                    x[22:].strip() for x in pieces[9].split("|") if x.startswith("go_annotation_summary")
                ]
                if len(go_annotation) != 1:
                    continue
                if sgdid in sgdid_to_bioentity:
                    yield {
                        "bioentity": sgdid_to_bioentity[sgdid],
                        "source": key_to_source["SGD"],
                        "text": go_annotation[0],
                        "html": go_annotation[0],
                        "date_created": None,
                        "created_by": None,
                        "category": "GO",
                    }
                else:
                    # A single parenthesised argument prints identically under
                    # the Python 2 print statement.
                    print("Bioentity not found: " + sgdid)
                    yield None

            # Regulation
            file_names = [
                "src/sgd/convert/data/regulationSummaries",
                "src/sgd/convert/data/15-8regulationSummaries.txt",
                "src/sgd/convert/data/15-9regulationSummaries.txt",
                "src/sgd/convert/data/15-10regulationSummaries.txt",
                "src/sgd/convert/data/15-11regulationSummaries.txt",
            ]

            for file_name in file_names:
                for row in make_file_starter(file_name)():
                    bioentity_key = (row[0], "LOCUS")
                    if bioentity_key not in key_to_bioentity:
                        # Unknown loci are reported to the consumer as None, without a message.
                        yield None
                        continue

                    bioentity = key_to_bioentity[bioentity_key]
                    # The locus's own names, with and without a trailing 'P',
                    # are handed to link_gene_names as its second argument.
                    own_names = {
                        bioentity.display_name,
                        bioentity.format_name,
                        bioentity.display_name + "P",
                        bioentity.format_name + "P",
                    }
                    # The regulation summary text is read from the third column.
                    yield {
                        "bioentity": bioentity,
                        "source": key_to_source["SGD"],
                        "text": row[2],
                        "html": link_gene_names(row[2], own_names, nex_session),
                        "category": "REGULATION",
                    }

            # Phenotype
            file_names = [
                "src/sgd/convert/data/PhenotypeSummaries032015.txt",
                "src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt",
                "src/sgd/convert/data/15-7phenoSummaries.txt",
                "src/sgd/convert/data/15-8phenoSummaries.txt",
                "src/sgd/convert/data/15-9phenoSummaries.txt",
                "src/sgd/convert/data/15-10phenoSummaries.txt",
                "src/sgd/convert/data/15-11phenoSummaries.txt",
            ]

            for file_name in file_names:
                for row in make_file_starter(file_name)():
                    bioentity_key = (row[0], "LOCUS")
                    if bioentity_key not in key_to_bioentity:
                        # Unknown loci are reported to the consumer as None, without a message.
                        yield None
                        continue

                    bioentity = key_to_bioentity[bioentity_key]
                    own_names = {
                        bioentity.display_name,
                        bioentity.format_name,
                        bioentity.display_name + "P",
                        bioentity.format_name + "P",
                    }
                    # The phenotype summary text is read from the second column.
                    yield {
                        "bioentity": bioentity,
                        "source": key_to_source["SGD"],
                        "text": row[1],
                        "html": link_gene_names(row[1], own_names, nex_session),
                        "category": "PHENOTYPE",
                    }
        finally:
            # Close in the original order; the inner finally makes sure a
            # failure while closing the first session does not skip the second.
            try:
                bud_session.close()
            finally:
                nex_session.close()