Example #1
0
    def go_starter():
        """Yield one row per GO term from the bud database, flagging slim terms.

        A term is marked ``is_slim`` when its GO id appears in the GO slim
        mapping file (column 6).
        """
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()

        # Collect the GO ids that belong to the slim set.
        slim_goids = set()
        for columns in make_file_starter('src/sgd/convert/data/go_slim_mapping.tab.txt')():
            if len(columns) >= 6:
                slim_goids.add(columns[5])

        key_to_source = dict((x.unique_key(), x) for x in nex_session.query(Source).all())

        for bud_obj in bud_session.query(Go).all():
            # Zero-pad the numeric id to the canonical 7-digit GO identifier.
            goid = 'GO:' + str(bud_obj.go_go_id).zfill(7)
            yield {'display_name': bud_obj.go_term,
                   'source': key_to_source['GO'],
                   'description': bud_obj.go_definition,
                   'go_id': goid,
                   'go_aspect': abbrev_to_go_aspect[bud_obj.go_aspect],
                   'is_slim': 1 if goid in slim_goids else 0,
                   'date_created': bud_obj.date_created,
                   'created_by': bud_obj.created_by}

        bud_session.close()
        nex_session.close()
Example #2
0
    def paragraph_reference_starter():
        nex_session = nex_session_maker()

        references = nex_session.query(Reference).all()
        key_to_paragraph = dict([(x.unique_key(), x) for x in nex_session.query(Paragraph).all()])
        pubmed_id_to_reference = dict([(x.pubmed_id, x) for x in references])
        sgdid_to_reference = dict([(x.sgdid, x) for x in references])

        # LSP
        for paragraph in key_to_paragraph.values():
            if paragraph.category == "LSP":
                sgdids = [x.split(">")[0] for x in paragraph.text.split("<reference:")]
                for sgdid in sgdids:
                    if sgdid in sgdid_to_reference:
                        reference = sgdid_to_reference[sgdid]
                        yield {"paragraph_id": paragraph.id, "reference_id": reference.id}
                    else:
                        if sgdid != "<p":
                            print "Reference not found: " + sgdid

        # Regulation
        file_names = [
            "src/sgd/convert/data/regulationSummaries",
            "src/sgd/convert/data/15-8regulationSummaries.txt",
            "src/sgd/convert/data/15-9regulationSummaries.txt",
            "src/sgd/convert/data/15-10regulationSummaries.txt",
            "src/sgd/convert/data/15-11regulationSummaries.txt",
        ]

        for file_name in file_names:
            for row in make_file_starter(file_name)():
                paragraph_key = (row[0], "BIOENTITY", "REGULATION")
                for pubmed_id in [int(x) for x in row[3].strip().split("|") if x != "references" and x != ""]:
                    if paragraph_key in key_to_paragraph and pubmed_id in pubmed_id_to_reference:
                        yield {
                            "paragraph_id": key_to_paragraph[paragraph_key].id,
                            "reference_id": pubmed_id_to_reference[pubmed_id].id,
                        }
                    else:
                        print "Paragraph or reference not found: " + str(paragraph_key) + " " + str(pubmed_id)
                        yield None

        # Strain
        for strain_key, paragraph in strain_paragraphs.iteritems():
            paragraph_key = (strain_key, "STRAIN", None)
            for pubmed_id in paragraph[1]:
                if paragraph_key in key_to_paragraph and pubmed_id in pubmed_id_to_reference:
                    yield {
                        "paragraph_id": key_to_paragraph[paragraph_key].id,
                        "reference_id": pubmed_id_to_reference[pubmed_id].id,
                    }
                else:
                    print "Paragraph or reference not found: " + str(paragraph_key) + " " + str(pubmed_id)
                    yield None

        nex_session.close()
Example #3
0
    def tag_starter():
        """Yield one row per tag name found in the SPELL tags file.

        Tag names live in column 2 as a pipe-separated list; descriptions are
        looked up (possibly None) from the module-level ``definitions`` map.
        """
        nex_session = nex_session_maker()

        for row in make_file_starter('src/sgd/convert/data/microarray_05_14/SPELL-tags.txt')():
            for raw_tag in row[2].strip().split('|'):
                name = raw_tag.strip()
                if name != '':
                    yield {'display_name': name,
                           'description': definitions.get(name)}

        nex_session.close()
Example #4
0
    def bioitem_tag_starter():
        """Yield (dataset, tag) association rows from the SPELL tags file."""
        nex_session = nex_session_maker()

        key_to_dataset = dict((x.unique_key(), x) for x in nex_session.query(Dataset).all())
        key_to_tag = dict((x.unique_key(), x) for x in nex_session.query(Tag).all())

        for row in make_file_starter('src/sgd/convert/data/microarray_05_14/SPELL-tags.txt')():
            # Drop the 4-character filename extension to form the dataset key.
            dataset_key = (row[1].strip()[:-4], 'DATASET')
            for raw_tag in row[2].strip().split('|'):
                tag_name = raw_tag.strip()
                if tag_name != '':
                    yield {'bioitem': key_to_dataset[dataset_key],
                           'tag': key_to_tag[create_format_name(tag_name)]}

        nex_session.close()
Example #5
0
    def datasetcolumn_starter():
        """Yield one row per data column of every known expression dataset file.

        Column names come from the header line of each PCL file; each column is
        matched against the (geo_id, condition-name) -> GSM mapping to attach a
        GEO accession and link where one exists.
        """
        nex_session = nex_session_maker()

        key_to_source = dict((x.unique_key(), x) for x in nex_session.query(Source).all())
        key_to_dataset = dict((x.unique_key(), x) for x in nex_session.query(Dataset).all())

        # (geo_id, condition name) -> GSM id.  Indexed twice: once with the
        # 'delta'/'sigma' markers stripped from the condition name, once as-is.
        mapping_file = expression_dir + '/pmid_filename_gse_conds_tags_file_20150204.txt'
        key_to_GSM = dict(((x[2], x[4].replace('delta', '').replace('sigma', '').strip()), x[5])
                          for x in make_file_starter(mapping_file)())
        key_to_GSM.update(((x[2], x[4].strip()), x[5]) for x in make_file_starter(mapping_file)())

        for path in os.listdir(expression_dir):
            if not os.path.isdir(expression_dir + '/' + path):
                continue
            for file in os.listdir(expression_dir + '/' + path):
                dataset_key = (file[:-4], 'DATASET')
                if dataset_key not in key_to_dataset:
                    continue
                # Only the header line is needed: it names every column.
                f = open(expression_dir + '/' + path + '/' + file, 'r')
                header = f.next().split('\t')
                f.close()

                geo_id = key_to_dataset[dataset_key].geo_id

                # The first three header columns are metadata, not data columns.
                for file_order, cell in enumerate(header[3:]):
                    column_name = cell.strip().decode('ascii', 'ignore')

                    if (geo_id, column_name) not in key_to_GSM and geo_id is not None:
                        print (geo_id, column_name)
                    col_geo_id = key_to_GSM[(geo_id, column_name)] if (geo_id, column_name) in key_to_GSM else None
                    link = None if col_geo_id is None else 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=' + col_geo_id
                    yield {'description': column_name,
                           'dataset': key_to_dataset[dataset_key],
                           'source': key_to_source['SGD'],
                           'file_order': file_order,
                           'geo_id': col_geo_id,
                           'link': link}

        nex_session.close()
Example #6
0
    def dataset_starter():
        nex_session = nex_session_maker()

        key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])
        pubmed_id_to_reference = dict([(x.pubmed_id, x) for x in nex_session.query(Reference).all()])

        #Filename to pubmed_id, geo_id, channel_count, tags, and short description
        filename_to_info = dict([(x[1], (int(x[0]), x[2], x[3], x[6], None)) for x in make_file_starter(expression_dir + '/pmid_filename_gse_conds_tags_file_20150204.txt')()])

        for path in os.listdir(expression_dir):
            if os.path.isdir(expression_dir + '/' + path):
                full_description = None
                geo_id = None
                pubmed_id = None

                state = 'BEGIN'
                try:
                    for row in make_file_starter(expression_dir + '/' + path + '/README')():
                        if row[0].startswith('Full Description'):
                            state = 'FULL_DESCRIPTION:'
                            full_description = row[0][18:].strip()
                        elif row[0].startswith('PMID:'):
                            pubmed_id = int(row[0][6:].strip())
                        elif row[0].startswith('GEO ID:'):
                            geo_id = row[0][8:].strip().split('.')[0].split('GPL')[0]
                        elif row[0].startswith('PCL filename'):
                            state = 'OTHER'
                        elif state == 'FULL_DESCRIPTION':
                            full_description = full_description + row[0].strip()
                        elif state == 'OTHER':
                            pcl_filename = row[0].strip()
                            short_description = row[1].strip()
                            tag = row[3].strip()
                            if pcl_filename in filename_to_info:
                                filename_to_info[pcl_filename] = (filename_to_info[pcl_filename][0], filename_to_info[pcl_filename][1], filename_to_info[pcl_filename][2], filename_to_info[pcl_filename][3], short_description)
                            else:
                                if geo_id == 'N/A':
                                    geo_id = None
                                filename_to_info[pcl_filename] = (pubmed_id, geo_id, 1, tag, short_description)

                    for file in os.listdir(expression_dir + '/' + path):
                        if file != 'README':
                            f = open(expression_dir + '/' + path + '/' + file, 'r')
                            pieces = f.next().split('\t')
                            f.close()

                            if file in filename_to_info:
                                pubmed_id, geo_id, channel_count, tags, short_description = filename_to_info[file]

                                if pubmed_id not in pubmed_id_to_reference:
                                    print 'Warning: pubmed_id not found ' + str(pubmed_id)

                                yield {
                                    'description': full_description,
                                    'geo_id': geo_id,
                                    'pcl_filename': file,
                                    'short_description': short_description,
                                    'tags': tags,
                                    'reference': None if pubmed_id is None or pubmed_id not in pubmed_id_to_reference else pubmed_id_to_reference[pubmed_id],
                                    'source': key_to_source['SGD'],
                                    'channel_count': channel_count,
                                    'condition_count': len(pieces)-3
                                }
                            else:
                                print 'Filename not in readme: ' + file
                except:
                    print 'File ' + expression_dir + '/' + path + '/README' + ' not found.'
                    print traceback.format_exc()

        nex_session.close()
Example #7
0
    def domain_starter():
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()

        key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])

        panther_id_to_description = {}
        for row in make_file_starter('src/sgd/convert/data/PANTHER9.0_HMM_classifications.txt')():
            panther_id_to_description[row[0]] = row[1].lower()

        for row in make_file_starter('src/sgd/convert/data/domains.tab')():
            source_key = row[3].strip()
            if source_key.startswith('ProSite'):
                source_key = 'Prosite'
            if source_key.startswith('SignalP'):
                source_key = 'SignalP'
            if source_key.startswith('Hamap'):
                source_key = 'HAMAP'
            if source_key == 'Coils':
                source_key = '-'

            display_name = row[4].strip()
            description = row[5].strip()
            interpro_id = None
            interpro_description = None
            if len(row) == 13:
                interpro_id = row[11].strip()
                interpro_description = row[12].strip()

            source_key = create_format_name(source_key)
            source = None if source_key not in key_to_source else key_to_source[source_key]

            description = None if description == '' else description
            interpro_description = None if interpro_description == '' else interpro_description
            interpro_id = None if interpro_id == '' else interpro_id

            if source_key == 'PANTHER':
                if display_name in panther_id_to_description:
                    yield {'display_name': display_name,
                       'source': source,
                       'description': panther_id_to_description[display_name],
                       'bioitem_type': source_key,
                       'interpro_id': interpro_id,
                       'interpro_description': interpro_description}

            elif source_key is not None:
                yield {'display_name': display_name,
                       'source': source,
                       'description': description if description is not None else interpro_description,
                       'bioitem_type': source_key,
                       'interpro_id': interpro_id,
                       'interpro_description': interpro_description}
            else:
                print 'Source not found: ' + source_key

        for row in make_file_starter('src/sgd/convert/data/TF_family_class_accession04302013.txt')():
            description = 'Class: ' + row[4] + ', Family: ' + row[3]
            yield {'display_name': row[0],
                   'source': key_to_source['JASPAR'],
                   'description': description,
                   'bioitem_type': 'JASPAR'}

        yield {'display_name': 'predicted signal peptide',
               'source': key_to_source['SignalP'],
               'description': 'predicted signal peptide',
               'bioitem_type': 'SignalP'}
        yield {'display_name': 'predicted transmembrane domain',
               'source': key_to_source['TMHMM'],
               'description': 'predicted transmembrane domain',
               'bioitem_type': 'TMHMM'}

        for bud_obj in bud_session.query(Dbxref).filter(or_(Dbxref.dbxref_type == 'PANTHER', Dbxref.dbxref_type == 'Prosite')).all():
            dbxref_type = bud_obj.dbxref_type
            source_key = create_format_name(bud_obj.source)
            source = None if source_key not in key_to_source else key_to_source[source_key]
            if source is None:
                print source_key
                yield None
            bioitem_type = None
            if dbxref_type == 'Prosite ID':
                bioitem_type = 'Prosite'
            elif dbxref_type == 'PANTHER':
                bioitem_type = 'PANTHER'

            if bioitem_type == 'PANTHER':
                if display_name in panther_id_to_description:
                    yield {'display_name': bud_obj.dbxref_id,
                       'source': source,
                       'description': panther_id_to_description[bud_obj.dbxref_id],
                       'bioitem_type': bioitem_type}
            else:
                yield {'display_name': bud_obj.dbxref_id,
                       'source': source,
                       'description': bud_obj.dbxref_name,
                       'bioitem_type': bioitem_type}

        bud_session.close()
        nex_session.close()
Example #8
0
    def experiment_starter():
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()

        key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])

        for bud_obj in make_obo_file_starter('src/sgd/convert/data/eco.obo')():
            description = None if 'def' not in bud_obj else bud_obj['def']
            if description is not None and description.find('[') >= 0:
                description = description[:description.find('[')-1]
            if description is not None and description.find('"') >= 0:
                description = description[1:-1]
            yield {'display_name': bud_obj['name'],
                   'source': key_to_source['ECO'],
                   'description': description,
                   'eco_id': bud_obj['id']}

        for bud_obj in bud_session.query(CVTerm).filter(CVTerm.cv_no==7).all():
            format_name = create_format_name(bud_obj.name)
            yield {'display_name': bud_obj.name,
                   'source': key_to_source['SGD'],
                   'description': bud_obj.definition,
                   'category': 'large-scale survey' if format_name in large_scale_survey else 'classical genetics' if format_name in classical_genetics else None,
                   'date_created': bud_obj.date_created,
                   'created_by': bud_obj.created_by}

        for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/Venters_Macisaac_Hu05-12-2014_regulator_lines')():
            source_key = row[11].strip()
            if source_key in key_to_source:
                yield {'display_name': row[4] if row[4] != '' else row[5],
                       'source': None if source_key not in key_to_source else key_to_source[source_key],
                       'eco_id': row[5]}
            else:
                print 'Source not found: ' + str(source_key)

        for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/SGD_data_05_14_2014')():
            source_key = row[11].strip()
            if source_key in key_to_source:
                yield {'display_name': row[4] if row[4] != '' else row[5],
                       'source': None if source_key not in key_to_source else key_to_source[source_key],
                       'eco_id': row[5]}
            else:
                print 'Source not found: ' + str(source_key)

        for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/Madhani_fixed')():
            if len(row) >= 10:
                if source_key in key_to_source:
                    source_key = row[11].strip()
                    yield {'display_name': row[5] if row[5] != '' else row[4],
                           'source': None if source_key not in key_to_source else key_to_source[source_key],
                           'eco_id': row[4]}
                else:
                    print 'Source not found: ' + str(source_key)

        for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/Pimentel_PMID22616008.txt')():
            if len(row) >= 10:
                if source_key in key_to_source:
                    source_key = row[11].strip()
                    yield {'display_name': row[4] if row[4] != '' else row[5],
                           'source': None if source_key not in key_to_source else key_to_source[source_key],
                           'eco_id': row[5]}
                else:
                    print 'Source not found: ' + str(source_key)

        for row in make_file_starter('src/sgd/convert/data/yetfasco_data.txt', delimeter=';')():
            expert_confidence = row[8][1:-1]
            if expert_confidence == 'High':
                yield {'display_name': row[9][1:-1],
                    'source': key_to_source['YeTFaSCo']}

        yield {'display_name': 'protein abundance', 'source': key_to_source['SGD']}
        yield {'display_name': 'EXP', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/exp-inferred-experiment', 'description': 'Inferred from Experiment'}
        yield {'display_name': 'IDA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ida-inferred-direct-assay', 'description': 'Inferred from Direct Assay'}
        yield {'display_name': 'IPI', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ipi-inferred-physical-interaction', 'description': 'Inferred from Physical Interaction'}
        yield {'display_name': 'IMP', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/imp-inferred-mutant-phenotype', 'description': 'Inferred from Mutant Phenotype'}
        yield {'display_name': 'IGI', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/igi-inferred-genetic-interaction', 'description': 'Inferred from Genetic Interaction'}
        yield {'display_name': 'IEP', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iep-inferred-expression-pattern', 'description': 'Inferred from Expression Pattern'}
        yield {'display_name': 'ISS', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iss-inferred-sequence-or-structural-similarity', 'description': 'Inferred from Sequence or Structural Similarity'}
        yield {'display_name': 'ISA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/isa-inferred-sequence-alignment', 'description': 'Inferred from Sequence Alignment'}
        yield {'display_name': 'ISO', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iso-inferred-sequence-orthology', 'description': 'Inferred from Sequence Orthology'}
        yield {'display_name': 'ISM', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ism-inferred-sequence-model', 'description': 'Inferred from Sequence Model'}
        yield {'display_name': 'IGC', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/igc-inferred-genomic-context', 'description': 'Inferred from Genomic Context'}
        yield {'display_name': 'IBA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iba-inferred-biological-aspect-ancestor', 'description': 'Inferred from Biological aspect of Ancestor'}
        yield {'display_name': 'IBD', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ibd-inferred-biological-aspect-descendent', 'description': 'Inferred from Biological aspect of Descendent'}
        yield {'display_name': 'IKR', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ikr-inferred-key-residues', 'description': 'Inferred from Key Residues'}
        yield {'display_name': 'IRD', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ird-inferred-rapid-divergence', 'description': 'Inferred from Rapid Divergence'}
        yield {'display_name': 'RCA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/rca-inferred-reviewed-computational-analysis', 'description': 'inferred from Reviewed Computational Analysis'}
        yield {'display_name': 'TAS', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/tas-traceable-author-statement', 'description': 'Traceable Author Statement'}
        yield {'display_name': 'NAS', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/nas-non-traceable-author-statement', 'description': 'Non-traceable Author Statement'}
        yield {'display_name': 'IC', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ic-inferred-curator', 'description': 'Inferred by Curator'}
        yield {'display_name': 'ND', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/nd-no-biological-data-available', 'description': 'No Biological Data Available'}
        yield {'display_name': 'IEA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/automatically-assigned-evidence-codes', 'description': 'Inferred from Electronic Annotation'}

        bud_session.close()
        nex_session.close()
Example #9
0
    def paragraph_reference_starter():
        """Yield paragraph<->reference link rows for LSP, regulation, and strain
        paragraphs.

        Yields dicts with 'paragraph_id' and 'reference_id'; yields None (after
        printing a warning) when a paragraph or reference cannot be resolved.
        """
        nex_session = nex_session_maker()

        references = nex_session.query(Reference).all()
        key_to_paragraph = dict([(x.unique_key(), x) for x in nex_session.query(Paragraph).all()])
        pubmed_id_to_reference = dict([(x.pubmed_id, x) for x in references])
        sgdid_to_reference = dict([(x.sgdid, x) for x in references])

        # LSP paragraphs embed references in the text as <reference:SGDID> tags;
        # split on the tag opener and take everything up to the closing '>'.
        for paragraph in key_to_paragraph.values():
            if paragraph.category == 'LSP':
                sgdids = [x.split('>')[0] for x in paragraph.text.split('<reference:')]
                for sgdid in sgdids:
                    if sgdid in sgdid_to_reference:
                        reference = sgdid_to_reference[sgdid]
                        yield {
                                'paragraph_id': paragraph.id,
                                'reference_id': reference.id
                            }
                    else:
                        # The chunk before the first tag starts with '<p' —
                        # not an SGDID, so don't warn about it.
                        if sgdid != '<p':
                            print 'Reference not found: ' + sgdid

        # Regulation summaries: column 0 is the bioentity name, column 3 a
        # pipe-separated list of pubmed ids (with a 'references' header token).
        file_names = ['src/sgd/convert/data/regulationSummaries',
                      'src/sgd/convert/data/15-8regulationSummaries.txt',
                      'src/sgd/convert/data/15-9regulationSummaries.txt',
                      'src/sgd/convert/data/15-10regulationSummaries.txt',
                      'src/sgd/convert/data/15-11regulationSummaries.txt',
                      'src/sgd/convert/data/16-1regulationSummaries.txt',
                      'src/sgd/convert/data/16-2regulationSummaries.txt',
                      'src/sgd/convert/data/16-3regulationSummaries.txt',
                      'src/sgd/convert/data/16-4regulationSummaries.txt',
                      'src/sgd/convert/data/16-5regulationSummaries.txt']

        for file_name in file_names:
            for row in make_file_starter(file_name)():
                paragraph_key = (row[0], 'BIOENTITY', 'REGULATION')
                for pubmed_id in [int(x) for x in row[3].strip().split('|') if x != 'references' and x != '']:
                    if paragraph_key in key_to_paragraph and pubmed_id in pubmed_id_to_reference:
                        yield {
                            'paragraph_id': key_to_paragraph[paragraph_key].id,
                            'reference_id': pubmed_id_to_reference[pubmed_id].id,
                        }
                    else:
                        print 'Paragraph or reference not found: ' + str(paragraph_key) + ' ' + str(pubmed_id)
                        yield None

        # Strain paragraphs come from the in-memory strain_paragraphs mapping;
        # each value's second element is its list of pubmed ids.
        for strain_key, paragraph in strain_paragraphs.iteritems():
            paragraph_key = (strain_key, 'STRAIN', None)
            for pubmed_id in paragraph[1]:
                if paragraph_key in key_to_paragraph and pubmed_id in pubmed_id_to_reference:
                    yield {
                        'paragraph_id': key_to_paragraph[paragraph_key].id,
                        'reference_id': pubmed_id_to_reference[pubmed_id].id,
                    }
                else:
                    print 'Paragraph or reference not found: ' + str(paragraph_key) + ' ' + str(pubmed_id)
                    yield None



        nex_session.close()
Example #10
0
    def bioentity_paragraph_starter():
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()

        key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])
        key_to_bioentity = dict([(x.unique_key(), x) for x in nex_session.query(Locus).all()])
        id_to_bioentity = dict([(x.id, x) for x in nex_session.query(Locus).all()])
        sgdid_to_reference = dict([(x.sgdid, x) for x in nex_session.query(Reference).all()])
        sgdid_to_bioentity = dict([(x.sgdid, x) for x in nex_session.query(Bioentity).all()])
        goid_to_go = dict([(int(x.go_id[3:]), x) for x in nex_session.query(Go).all()])

        #LSP
        for feature in bud_session.query(Feature).all():
            paragraph_feats = feature.paragraph_feats
            if len(paragraph_feats) > 0 and feature.id in id_to_bioentity:
                paragraph_feats.sort(key=lambda x: x.order)
                paragraph_html, paragraph_text = clean_paragraph(id_to_bioentity[feature.id], '<p>' + ('</p><p>'.join([x.paragraph.text for x in paragraph_feats])) + '</p>', str([x.paragraph.id for x in paragraph_feats]), sgdid_to_reference, sgdid_to_bioentity, goid_to_go)
                
                date_edited = None
                year = 0
                month = 0
                day = 0
                for paragraph_feat in paragraph_feats:
                    my_date = paragraph_feat.paragraph.date_edited
                    this_date = str(my_date).split(' ')[0].replace('-0', '-').split('-')
                    this_year = int(this_date[0])
                    this_month = int(this_date[1])
                    this_day = int(this_date[2])
                    if date_edited is None or datetime(this_year, this_month, this_day) > datetime(year, month, day):
                        date_edited = my_date
                        year = this_year
                        month = this_month
                        day = this_day

                yield {
                    'bioentity': id_to_bioentity[feature.id],
                    'source': key_to_source['SGD'],
                    'text': paragraph_text,
                    'html': paragraph_html,
                    'date_edited': date_edited,
                    'date_created': paragraph_feats[0].paragraph.date_created,
                    'created_by': paragraph_feats[0].paragraph.created_by,
                    'category': 'LSP'
                }

        bioentity_key_to_date = dict()
        #Go
        for gofeature in bud_session.query(GoFeature).all():
            bioentity_key = (gofeature.feature.name, 'LOCUS')
            # Keep only the FIRST manually-curated review date seen per locus
            # (fragment: the enclosing loop/def header is above this chunk).
            if gofeature.annotation_type == 'manually curated' and bioentity_key not in bioentity_key_to_date:
                bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

        # GODATE paragraphs: one per locus with a curated GO annotation; the
        # paragraph text/html is just the stringified review date.
        for bioentity_key, date_last_reviewed in bioentity_key_to_date.iteritems():
            if bioentity_key in key_to_bioentity:
                yield {
                    'bioentity': key_to_bioentity[bioentity_key],
                    'source': key_to_source['SGD'],
                    'text': str(date_last_reviewed),
                    'html': str(date_last_reviewed),
                    'date_created': None,
                    'created_by': None,
                    'category': 'GODATE'
                }
            else:
                #print 'Bioentity not found: ' + str(bioentity_key)
                yield None

        # GO annotation summaries parsed from a GPI-style flat file.
        # NOTE(review): the guard checks len(pieces) >= 8 but pieces[8] and
        # pieces[9] are indexed below — a row with exactly 8 or 9 fields would
        # raise IndexError; guard should likely be >= 10. Confirm against data.
        for pieces in make_file_starter('src/sgd/convert/data/gp_information.559292_sgd')():
            if len(pieces) >= 8:
                sgdid = pieces[8]
                if sgdid.startswith('SGD:'):
                    sgdid = sgdid[4:]
                    # pieces[9] is a '|'-separated property list; x[22:] strips the
                    # 'go_annotation_summary' key (21 chars) plus, presumably, a
                    # one-char separator — TODO confirm the separator is '='.
                    go_annotation = [x[22:].strip() for x in pieces[9].split('|') if x.startswith('go_annotation_summary')]
                    if len(go_annotation) == 1:
                        if sgdid in sgdid_to_bioentity:
                            yield {
                                'bioentity': sgdid_to_bioentity[sgdid],
                                'source': key_to_source['SGD'],
                                'text': go_annotation[0],
                                'html': go_annotation[0],
                                'date_created': None,
                                'created_by': None,
                                'category': 'GO'
                            }
                        else:
                            print 'Bioentity not found: ' + sgdid
                            yield None

        #Regulation
        # Monthly regulation-summary dumps; later files presumably supersede or
        # extend earlier ones — verify dedup happens downstream.
        file_names = ['src/sgd/convert/data/regulationSummaries',
                      'src/sgd/convert/data/15-8regulationSummaries.txt',
                      'src/sgd/convert/data/15-9regulationSummaries.txt',
                      'src/sgd/convert/data/15-10regulationSummaries.txt',
                      'src/sgd/convert/data/15-11regulationSummaries.txt',
                      'src/sgd/convert/data/16-1regulationSummaries.txt',
                      'src/sgd/convert/data/16-2regulationSummaries.txt',
                      'src/sgd/convert/data/16-3regulationSummaries.txt',
                      'src/sgd/convert/data/16-4regulationSummaries.txt',
                      'src/sgd/convert/data/16-5regulationSummaries.txt']

        for file_name in file_names:
            for row in make_file_starter(file_name)():
                # row[0] is the locus systematic name; row[2] the summary text.
                bioentity_key = (row[0], 'LOCUS')

                if bioentity_key in key_to_bioentity:
                    bioentity = key_to_bioentity[bioentity_key]
                    yield {
                        'bioentity': bioentity,
                        'source': key_to_source['SGD'],
                        'text': row[2],
                        # Link gene names in the text, excluding this locus's own
                        # names (and their protein 'P' suffix forms) to avoid
                        # self-links.
                        'html': link_gene_names(row[2], {bioentity.display_name, bioentity.format_name, bioentity.display_name + 'P', bioentity.format_name + 'P'}, nex_session),
                        'category': 'REGULATION'
                    }
                else:
                    #print 'Bioentity not found: ' + str(bioentity_key)
                    yield None

        #Phenotype
        # Phenotype summaries follow the same shape, but the text is row[1].
        file_names = ['src/sgd/convert/data/PhenotypeSummaries032015.txt',
                      'src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt',
                      'src/sgd/convert/data/15-7phenoSummaries.txt',
                      'src/sgd/convert/data/15-8phenoSummaries.txt',
                      'src/sgd/convert/data/15-9phenoSummaries.txt',
                      'src/sgd/convert/data/15-10phenoSummaries.txt',
                      'src/sgd/convert/data/15-11phenoSummaries.txt',
                      'src/sgd/convert/data/15-12phenoSummaries.txt',
                      'src/sgd/convert/data/16-1phenoSummaries.txt',
                      'src/sgd/convert/data/16-2phenoSummaries.txt',
                      'src/sgd/convert/data/16-3phenoSummaries.txt',
                      'src/sgd/convert/data/16-4phenoSummaries.txt',
                      'src/sgd/convert/data/16-5phenoSummaries.txt',
                      'src/sgd/convert/data/16-6phenoSummaries.txt',
                      'src/sgd/convert/data/16-7phenoSummaries.txt',
                      'src/sgd/convert/data/16-9phenoSummaries.txt',
                      'src/sgd/convert/data/16-10phenoSummaries.txt']

        for file_name in file_names:
            for row in make_file_starter(file_name)():
                bioentity_key = (row[0], 'LOCUS')
                if bioentity_key in key_to_bioentity:
                    bioentity = key_to_bioentity[bioentity_key]
                    yield {
                        'bioentity': bioentity,
                        'source': key_to_source['SGD'],
                        'text': row[1],
                        'html': link_gene_names(row[1], {bioentity.display_name, bioentity.format_name, bioentity.display_name + 'P', bioentity.format_name + 'P'}, nex_session),
                        'category': 'PHENOTYPE'
                        }
                else:
                    #print 'Bioentity not found: ' + str(bioentity_key)
                    yield None


        # Release both DB sessions once the generator is exhausted.
        bud_session.close()
        nex_session.close()
Example #11
0
    def bioentity_paragraph_starter():
        """Yield paragraph records for loci: LSP, GO summary, GO review date,
        regulation, and phenotype.

        Combines bud-database paragraphs with several flat summary files.
        Yields one dict per paragraph; yields None when the referenced
        bioentity cannot be resolved, so callers can count/skip misses.
        """
        bud_session = bud_session_maker()
        nex_session = nex_session_maker()

        key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])
        # Query Locus ONCE and build both lookups from the same result set
        # (the original issued two identical queries).
        loci = nex_session.query(Locus).all()
        key_to_bioentity = dict([(x.unique_key(), x) for x in loci])
        id_to_bioentity = dict([(x.id, x) for x in loci])
        sgdid_to_reference = dict([(x.sgdid, x) for x in nex_session.query(Reference).all()])
        sgdid_to_bioentity = dict([(x.sgdid, x) for x in nex_session.query(Bioentity).all()])
        # go_id is formatted like 'GO:0006915'; key the lookup by its integer part.
        goid_to_go = dict([(int(x.go_id[3:]), x) for x in nex_session.query(Go).all()])

        # LSP: stitch a feature's ordered paragraphs into one cleaned HTML/text blob.
        for feature in bud_session.query(Feature).all():
            paragraph_feats = feature.paragraph_feats
            if len(paragraph_feats) > 0 and feature.id in id_to_bioentity:
                paragraph_feats.sort(key=lambda x: x.order)
                paragraph_html, paragraph_text = clean_paragraph(
                    id_to_bioentity[feature.id],
                    "<p>" + ("</p><p>".join([x.paragraph.text for x in paragraph_feats])) + "</p>",
                    str([x.paragraph.id for x in paragraph_feats]),
                    sgdid_to_reference,
                    sgdid_to_bioentity,
                    goid_to_go,
                )
                yield {
                    "bioentity": id_to_bioentity[feature.id],
                    "source": key_to_source["SGD"],
                    "text": paragraph_text,
                    "html": paragraph_html,
                    "date_edited": paragraph_feats[0].paragraph.date_edited,
                    "date_created": paragraph_feats[0].paragraph.date_created,
                    "created_by": paragraph_feats[0].paragraph.created_by,
                    "category": "LSP",
                }

        bioentity_key_to_date = dict()
        # Go: keep only the FIRST manually-curated review date seen per locus.
        for gofeature in bud_session.query(GoFeature).all():
            bioentity_key = (gofeature.feature.name, "LOCUS")
            if gofeature.annotation_type == "manually curated" and bioentity_key not in bioentity_key_to_date:
                bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

        # .items() is behavior-equivalent to .iteritems() here and also works
        # on Python 3.
        for bioentity_key, date_last_reviewed in bioentity_key_to_date.items():
            if bioentity_key in key_to_bioentity:
                yield {
                    "bioentity": key_to_bioentity[bioentity_key],
                    "source": key_to_source["SGD"],
                    "text": str(date_last_reviewed),
                    "html": str(date_last_reviewed),
                    "date_created": None,
                    "created_by": None,
                    "category": "GODATE",
                }
            else:
                # print('Bioentity not found: ' + str(bioentity_key))
                yield None

        # GO annotation summaries parsed from a GPI-style flat file.
        for pieces in make_file_starter("src/sgd/convert/data/gp_information.559292_sgd")():
            # Bug fix: pieces[8] and pieces[9] are read below, so the row needs
            # at least 10 fields (the old `>= 8` guard allowed an IndexError on
            # 8- or 9-field rows).
            if len(pieces) >= 10:
                sgdid = pieces[8]
                if sgdid.startswith("SGD:"):
                    sgdid = sgdid[4:]
                    # pieces[9] is a '|'-separated property list; x[22:] strips
                    # the 'go_annotation_summary' key (21 chars) plus its
                    # one-char separator.
                    go_annotation = [
                        x[22:].strip() for x in pieces[9].split("|") if x.startswith("go_annotation_summary")
                    ]
                    if len(go_annotation) == 1:
                        if sgdid in sgdid_to_bioentity:
                            yield {
                                "bioentity": sgdid_to_bioentity[sgdid],
                                "source": key_to_source["SGD"],
                                "text": go_annotation[0],
                                "html": go_annotation[0],
                                "date_created": None,
                                "created_by": None,
                                "category": "GO",
                            }
                        else:
                            # print(...) is identical output on Python 2 and 3.
                            print("Bioentity not found: " + sgdid)
                            yield None

        # Regulation: monthly summary dumps; row[0] is the locus name, row[2]
        # the summary text.
        file_names = [
            "src/sgd/convert/data/regulationSummaries",
            "src/sgd/convert/data/15-8regulationSummaries.txt",
            "src/sgd/convert/data/15-9regulationSummaries.txt",
            "src/sgd/convert/data/15-10regulationSummaries.txt",
            "src/sgd/convert/data/15-11regulationSummaries.txt",
        ]

        for file_name in file_names:
            for row in make_file_starter(file_name)():
                bioentity_key = (row[0], "LOCUS")

                if bioentity_key in key_to_bioentity:
                    bioentity = key_to_bioentity[bioentity_key]
                    yield {
                        "bioentity": bioentity,
                        "source": key_to_source["SGD"],
                        "text": row[2],
                        # Link gene names in the text, excluding this locus's
                        # own names (and their 'P' protein-suffix forms) to
                        # avoid self-links.
                        "html": link_gene_names(
                            row[2],
                            {
                                bioentity.display_name,
                                bioentity.format_name,
                                bioentity.display_name + "P",
                                bioentity.format_name + "P",
                            },
                            nex_session,
                        ),
                        "category": "REGULATION",
                    }
                else:
                    # print('Bioentity not found: ' + str(bioentity_key))
                    yield None

        # Phenotype: same shape as regulation, but the text is row[1].
        file_names = [
            "src/sgd/convert/data/PhenotypeSummaries032015.txt",
            "src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt",
            "src/sgd/convert/data/15-7phenoSummaries.txt",
            "src/sgd/convert/data/15-8phenoSummaries.txt",
            "src/sgd/convert/data/15-9phenoSummaries.txt",
            "src/sgd/convert/data/15-10phenoSummaries.txt",
            "src/sgd/convert/data/15-11phenoSummaries.txt",
        ]

        for file_name in file_names:
            for row in make_file_starter(file_name)():
                bioentity_key = (row[0], "LOCUS")
                if bioentity_key in key_to_bioentity:
                    bioentity = key_to_bioentity[bioentity_key]
                    yield {
                        "bioentity": bioentity,
                        "source": key_to_source["SGD"],
                        "text": row[1],
                        "html": link_gene_names(
                            row[1],
                            {
                                bioentity.display_name,
                                bioentity.format_name,
                                bioentity.display_name + "P",
                                bioentity.format_name + "P",
                            },
                            nex_session,
                        ),
                        "category": "PHENOTYPE",
                    }
                else:
                    # print('Bioentity not found: ' + str(bioentity_key))
                    yield None

        # Release both DB sessions once the generator is exhausted.
        bud_session.close()
        nex_session.close()
Example #12
0
    # (fragment: the enclosing function's def header is above this chunk)
    # Remove DNASEQUENCE evidence rows whose parent Evidence record is gone.
    clean_up_orphans(nex_session_maker, DNAsequenceevidence, Evidence, 'DNASEQUENCE')
    # Load per-strain DNA sequence evidence from the new sequence files.
    # NOTE(review): the lambda closes over `strain_key` (late binding); this is
    # safe only if do_conversion consumes it before the loop advances — confirm.
    for sequence_filename, coding_sequence_filename, strain_key in new_sequence_files:
        do_conversion(make_new_dna_sequence_evidence_starter(nex_session_maker, strain_key, sequence_filename, coding_sequence_filename),
                      [Json2Obj(DNAsequenceevidence),
                       Obj2NexDB(nex_session_maker, lambda x: x.query(DNAsequenceevidence).filter(DNAsequenceevidence.strain_id == strain_key_to_id[strain_key]).filter(DNAsequenceevidence.dna_type != '1KB'),
                                 name='convert.from_bud.evidence.dnasequence',
                                 delete_untouched=True,
                                 commit_interval=1000)])


    update_contig_centromeres(nex_session_maker)
    update_contig_reference_alignment(nex_session_maker)


    # Index ProtParam rows by their first column — presumably a locus
    # identifier; verify against make_protein_sequence_evidence_starter.
    protparam_data = dict([(row[0], row) for row in make_file_starter('src/sgd/convert/data/ProtParam.txt')()])
    # Load per-strain protein sequence evidence, then clear orphans.
    for sequence_filename, strain_key in protein_sequence_files:
        do_conversion(make_protein_sequence_evidence_starter(nex_session_maker, strain_key, sequence_filename, protparam_data),
                      [Json2Obj(Proteinsequenceevidence),
                       Obj2NexDB(nex_session_maker, lambda x: x.query(Proteinsequenceevidence).filter(Proteinsequenceevidence.strain_id == strain_key_to_id[strain_key]), name='convert.from_bud.evidence.proteinsequence', delete_untouched=True, commit_interval=1000)])
    clean_up_orphans(nex_session_maker, Proteinsequenceevidence, Evidence, 'PROTEINSEQUENCE')

    # 1KB sequences are converted separately; the query filters on
    # dna_type == '1KB', the complement of the filter used above.
    do_conversion(make_kb_sequence_starter(nex_session_maker),
                      [Json2Obj(DNAsequenceevidence),
                       Obj2NexDB(nex_session_maker, lambda x: x.query(DNAsequenceevidence).filter(DNAsequenceevidence.dna_type == '1KB'),
                                 name='convert.from_bud.evidence.1kb_dnasequence',
                                 delete_untouched=True,
                                 commit_interval=1000)])