def parse_fasta(databases_directory, config, import_directory, download=True, updated_on=None):
    stats = set()
    url = config['uniprot_fasta_file']
    entities_output_file = os.path.join(import_directory, "Amino_acid_sequence.tsv")
    rel_output_file = os.path.join(import_directory, "Protein_HAS_Sequence_Amino_acid_sequence.tsv")
    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    ff = builder_utils.read_gzipped_file(file_name)
    records = builder_utils.parse_fasta(ff)
    num_entities = 0
    with open(entities_output_file, 'w', encoding='utf-8') as ef:
        ef.write('ID\theader\tsequence\tsize\tsource\n')
        with open(rel_output_file, 'w', encoding='utf-8') as rf:
            rf.write('START_ID\tEND_ID\tTYPE\tsource\n')
            for batch in builder_utils.batch_iterator(records, 1000):
                for record in batch:
                    identifier = record.id.split('|')[1]
                    header = record.id
                    sequence = str(record.seq)
                    sequence_len = len(sequence)
                    ef.write(identifier + "\t" + header + '\t' + sequence + '\t' + str(sequence_len) + '\tUniProt\n')
                    rf.write(identifier + '\t' + identifier + '\tHAS_SEQUENCE\tUniProt\n')
                    num_entities += 1

    stats.add(builder_utils.buildStats(num_entities, "entity", "Amino_acid_sequence", "UniProt", entities_output_file, updated_on))
    stats.add(builder_utils.buildStats(num_entities, "relationships", "HAS_SEQUENCE", "UniProt", rel_output_file, updated_on))

    return stats
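
# Usage sketch for parse_fasta (hypothetical paths and URL; the real values come
# from the UniProt YAML config). With download=False an existing gzipped FASTA
# under <databases>/UniProt is parsed in batches of 1000 records.
def _example_parse_fasta():
    config = {'uniprot_fasta_file': 'https://example.org/uniprot_sprot.fasta.gz'}  # hypothetical URL
    stats = parse_fasta('/data/databases', config, '/data/imports',
                        download=False, updated_on='2021-01-01')
    for stat in stats:
        print(stat)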
def generate_graph_files(data, dataType, projectId, stats, ot='w', dataset_import_dir='experiments'):
    if dataType == '':
        outputfile = os.path.join(dataset_import_dir, projectId + ".tsv")
    else:
        outputfile = os.path.join(dataset_import_dir, projectId + "_" + dataType.lower() + ".tsv")

    with open(outputfile, ot, encoding="utf-8") as f:
        data.to_csv(path_or_buf=f, sep='\t',
                    header=True, index=False, quotechar='"',
                    line_terminator='\n', escapechar='\\')
    logger.info("Experiment {} - Number of {} relationships: {}".format(projectId, dataType, data.shape[0]))
    stats.add(builder_utils.buildStats(data.shape[0], "relationships", dataType, "Experiment", outputfile))
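
# Usage sketch for generate_graph_files (hypothetical project id and rows). The
# DataFrame is written as-is, so its columns should already match the expected
# relationship header (e.g. START_ID, END_ID, TYPE).
def _example_generate_graph_files():
    data = pd.DataFrame([('S1', 'P01308', 'HAS_QUANTIFIED_PROTEIN')],
                        columns=['START_ID', 'END_ID', 'TYPE'])  # hypothetical rows
    stats = set()
    generate_graph_files(data, 'proteins', 'P0000001', stats, ot='w',
                         dataset_import_dir='experiments')
    print(stats)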
def print_single_file(data, header, output_file, data_type, data_object, is_first, updated_on):
    stats = set()
    df = pd.DataFrame(list(data), columns=header)
    stats.add(builder_utils.buildStats(len(data), data_type, data_object, "UniProt", output_file, updated_on))
    with open(output_file, 'a', encoding='utf-8') as ef:
        df.to_csv(path_or_buf=ef, sep='\t',
                  header=is_first, index=False, quotechar='"',
                  line_terminator='\n', escapechar='\\')

    return stats
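
# Usage sketch for print_single_file (hypothetical rows). Because the output is
# opened in append mode, callers writing in batches pass is_first=True only for
# the first batch so the header row is written exactly once.
def _example_print_single_file():
    header = ['ID', 'name', 'source']
    batch1 = [('P01308', 'INS', 'UniProt')]   # hypothetical batches
    batch2 = [('P04637', 'TP53', 'UniProt')]
    stats = print_single_file(batch1, header, 'Protein.tsv', 'entity', 'Protein',
                              is_first=True, updated_on='2021-01-01')
    stats.update(print_single_file(batch2, header, 'Protein.tsv', 'entity', 'Protein',
                                   is_first=False, updated_on='2021-01-01'))
    return stats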
def print_multiple_relationships_files(data, header, output_dir, is_first, updated_on):
    stats = set()
    for entity, relationship in data:
        df = pd.DataFrame(list(data[(entity, relationship)]), columns=header)
        output_file = os.path.join(output_dir, entity + "_" + relationship.lower() + ".tsv")
        stats.add(builder_utils.buildStats(len(data[(entity, relationship)]), 'relationships', relationship, "UniProt", output_file, updated_on))
        with open(output_file, 'a', encoding='utf-8') as ef:
            df.to_csv(path_or_buf=ef, sep='\t',
                      header=is_first, index=False, quotechar='"',
                      line_terminator='\n', escapechar='\\')

    return stats
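
# Usage sketch for print_multiple_relationships_files (hypothetical data). The
# input is a dict keyed by (entity, relationship) tuples; one TSV is written
# per key, e.g. Protein_belongs_to_protein.tsv.
def _example_print_multiple_relationships_files():
    data = {('Protein', 'BELONGS_TO_PROTEIN'):
            {('P01308-1', 'P01308', 'BELONGS_TO_PROTEIN', 'UniProt')}}  # hypothetical
    header = ['START_ID', 'END_ID', 'TYPE', 'source']
    return print_multiple_relationships_files(data, header, '/data/imports',
                                              is_first=True, updated_on='2021-01-01')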
def parseDatabase(importDirectory, database, download=True):
    stats = set()
    updated_on = None
    if download:
        updated_on = str(date.today())
    try:
        logger.info("Parsing database {}".format(database))
        database_directory = ckg_config['databases_directory']
        if database.lower() == "jensenlab":
            result = jensenlabParser.parser(database_directory, download)
            for qtype in result:
                relationships, header, outputfileName = result[qtype]
                outputfile = os.path.join(importDirectory, outputfileName)
                builder_utils.write_relationships(relationships, header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, qtype, len(relationships)))
                stats.add(builder_utils.buildStats(len(relationships), "relationships", qtype, database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "mentions":
            num_entities, outputfile = textminingParser.parser(database_directory, importDirectory, download)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Publication", num_entities))
            stats.add(builder_utils.buildStats(num_entities, "entity", "Publication", database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "hgnc":
            # HGNC
            entities, header = hgncParser.parser(database_directory, download)
            outputfile = os.path.join(importDirectory, "Gene.tsv")
            builder_utils.write_entities(entities, header, outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Gene", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Gene", database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "refseq":
            entities, relationships, headers = refseqParser.parser(database_directory, download)
            for entity in entities:
                header = headers[entity]
                outputfile = os.path.join(importDirectory, entity + ".tsv")
                builder_utils.write_entities(entities[entity], header, outputfile)
                logger.info("Database {} - Number of {} entities: {}".format(database, entity, len(entities[entity])))
                stats.add(builder_utils.buildStats(len(entities[entity]), "entity", entity, database, outputfile, updated_on))
            for rel in relationships:
                header = headers[rel]
                outputfile = os.path.join(importDirectory, "refseq_" + rel.lower() + ".tsv")
                builder_utils.write_relationships(relationships[rel], header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, rel, len(relationships[rel])))
                stats.add(builder_utils.buildStats(len(relationships[rel]), "relationships", rel, database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "uniprot":
            # UniProt
            stats.update(uniprotParser.parser(database_directory, importDirectory, download, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "pfam":
            # Pfam
            stats.update(pfamParser.parser(database_directory, importDirectory, download, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "intact":
            # IntAct
            relationships, header, outputfileName = intactParser.parser(database_directory, download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "curated_interacts_with", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "curated_interacts_with", database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "mutationds":
            # MutationDs
            relationships, header, outputfileName = mutationDsParser.parser(database_directory, download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "curated_affects_interaction_with", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "curated_affects_interaction_with", database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "string":
            # STRING
            proteinMapping, drugMapping = stringParser.parser(database_directory, importDirectory, download=download)
            stringParser.parseActions(database_directory, importDirectory, proteinMapping, drugMapping, download=download, db="STRING")
            print("Done Parsing database {}".format(database))
        elif database.lower() == "stitch":
            # STITCH
            proteinMapping, drugMapping = stringParser.parser(database_directory, importDirectory, drug_source=dbconfig["sources"]["Drug"], download=download, db="STITCH")
            stringParser.parseActions(database_directory, importDirectory, proteinMapping, drugMapping, download=download, db="STITCH")
            print("Done Parsing database {}".format(database))
        elif database.lower() == "disgenet":
            # DisGeNET
            relationships, header, outputfileName = disgenetParser.parser(database_directory, download)
            for idType in relationships:
                outputfile = os.path.join(importDirectory, idType + "_" + outputfileName)
                builder_utils.write_relationships(relationships[idType], header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, idType, len(relationships[idType])))
                stats.add(builder_utils.buildStats(len(relationships[idType]), "relationships", idType, database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "pathwaycommons":
            # PathwayCommons pathways
            entities, relationships, entities_header, relationships_header = pathwayCommonsParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Pathway.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            stats.add(builder_utils.buildStats(len(entities), "entity", "Pathway", database, entity_outputfile, updated_on))
            pathway_outputfile = os.path.join(importDirectory, "pathwaycommons_protein_associated_with_pathway.tsv")
            builder_utils.write_relationships(relationships, relationships_header, pathway_outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "protein_associated_with_pathway", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "protein_associated_with_pathway", database, pathway_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "reactome":
            # Reactome
            entities, relationships, entities_header, relationships_header = reactomeParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, database.lower() + "_Pathway.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            stats.add(builder_utils.buildStats(len(entities), "entity", "Pathway", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                reactome_outputfile = os.path.join(importDirectory, database.lower() + "_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_header[entity], reactome_outputfile)
                logger.info("Database {} - Number of {} {} relationships: {}".format(database, entity, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, reactome_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "smpdb":
            # SMPDB
            entities, relationships, entities_header, relationships_header = smpdbParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, database.lower() + "_Pathway.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            stats.add(builder_utils.buildStats(len(entities), "entity", "Pathway", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                smpdb_outputfile = os.path.join(importDirectory, database.lower() + "_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_header[entity], smpdb_outputfile)
                logger.info("Database {} - Number of {} {} relationships: {}".format(database, entity, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, smpdb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "dgidb":
            relationships, header, outputfileName = drugGeneInteractionDBParser.parser(database_directory, download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "targets", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "targets", database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "sider":
            relationships, header, outputfileName, drugMapping, phenotypeMapping = siderParser.parser(database_directory, dbconfig["sources"]["Drug"], download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "has_side_effect", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "has_side_effect", database, outputfile, updated_on))
            relationships, header, outputfileName = siderParser.parserIndications(database_directory, drugMapping, phenotypeMapping, download=download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "indicated_for", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "indicated_for", database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "oncokb":
            entities, relationships, entities_header, relationships_headers = oncokbParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "oncokb_Clinically_relevant_variant.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Clinically_relevant_variant", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Clinically_relevant_variant", database, entity_outputfile, updated_on))
            for relationship in relationships:
                oncokb_outputfile = os.path.join(importDirectory, "oncokb_" + relationship + ".tsv")
                header = ['START_ID', 'END_ID', 'TYPE']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship], header, oncokb_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, oncokb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "cancergenomeinterpreter":
            entities, relationships, entities_header, relationships_headers = cancerGenomeInterpreterParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "cgi_Clinically_relevant_variant.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Clinically_relevant_variant", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Clinically_relevant_variant", database, entity_outputfile, updated_on))
            for relationship in relationships:
                cgi_outputfile = os.path.join(importDirectory, "cgi_" + relationship + ".tsv")
                header = ['START_ID', 'END_ID', 'TYPE']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship], header, cgi_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, cgi_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "hmdb":
            entities, relationships, entities_header, relationships_header = hmdbParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Metabolite.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Metabolite", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Metabolite", database, entity_outputfile, updated_on))
            for relationship in relationships:
                hmdb_outputfile = os.path.join(importDirectory, relationship + ".tsv")
                builder_utils.write_relationships(relationships[relationship], relationships_header, hmdb_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, hmdb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "drugbank":
            entities, relationships, entities_header, relationships_headers = drugBankParser.parser(database_directory)
            entity_outputfile = os.path.join(importDirectory, "Drug.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Drug", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Drug", database, entity_outputfile, updated_on))
            for relationship in relationships:
                relationship_outputfile = os.path.join(importDirectory, relationship + ".tsv")
                header = ['START_ID', 'END_ID', 'TYPE', 'source']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship], header, relationship_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, relationship_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "gwascatalog":
            entities, relationships, entities_header, relationships_header = gwasCatalogParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "GWAS_study.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "GWAS_study", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "GWAS_study", database, entity_outputfile, updated_on))
            for relationship in relationships:
                header = ['START_ID', 'END_ID', 'TYPE', 'source']
                if relationship in relationships_header:
                    header = relationships_header[relationship]
                outputfile = os.path.join(importDirectory, "GWAS_study_" + relationship + ".tsv")
                builder_utils.write_relationships(relationships[relationship], header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "phosphositeplus":
            entities, relationships, entities_header, relationships_headers = pspParser.parser(database_directory)
            entity_outputfile = os.path.join(importDirectory, "psp_Modified_protein.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Modified_protein", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Modified_protein", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                rel_header = ["START_ID", "END_ID", "TYPE", "source"]
                if entity in relationships_headers:
                    rel_header = relationships_headers[entity]
                outputfile = os.path.join(importDirectory, "psp_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], rel_header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "signor":
            entities, relationships, entities_header, relationships_headers = signorParser.parser(database_directory)
            entity_outputfile = os.path.join(importDirectory, "signor_Modified_protein.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Modified_protein", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Modified_protein", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                rel_header = ["START_ID", "END_ID", "TYPE", "source"]
                prefix = 'signor_' + entity.lower()
                if relationship in relationships_headers:
                    rel_header = relationships_headers[relationship]
                if relationship == 'mentioned_in_publication':
                    prefix = entity
                outputfile = os.path.join(importDirectory, prefix + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], rel_header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "corum":
            entities, relationships, entities_header, relationships_headers = corumParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Complex.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Complex", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Complex", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                corum_outputfile = os.path.join(importDirectory, database.lower() + "_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_headers[entity], corum_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, corum_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "foodb":
            entities, relationships, entities_header, relationships_headers = foodbParser.parser(database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Food.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Food", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Food", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                foodb_outputfile = os.path.join(importDirectory, database.lower() + "_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_headers[entity], foodb_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, foodb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "exposome explorer":
            relationships, header = exposomeParser.parser(database_directory, download)
            for entity, relationship in relationships:
                ee_outputfile = os.path.join(importDirectory, database.lower() + "_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], header[entity], ee_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, ee_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "hpa":
            relationships, headers = hpaParser.parser(database_directory, download)
            for entity, relationship in relationships:
                hpa_outputfile = os.path.join(importDirectory, database.lower() + "_" + entity.lower() + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], headers[relationship], hpa_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, hpa_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Database {}: {}, file: {}, line: {}".format(database, err, fname, exc_tb.tb_lineno))

    return stats
def parser(databases_directory, import_directory, download=True, updated_on=None):
    config = builder_utils.get_config(config_name="pfamConfig.yml", data_type='databases')
    entity_header = config['entity_header']
    relationship_headers = config['relationship_headers']

    directory = os.path.join(databases_directory, 'Pfam')
    builder_utils.checkDirectory(directory)
    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values()))

    ftp_url = config['ftp_url']
    filename = config['full_uniprot_file']

    if not os.path.exists(os.path.join(directory, filename)):
        if download:
            builder_utils.downloadDB(ftp_url + filename, directory)

    stats = set()
    if os.path.exists(os.path.join(directory, filename)):
        fhandler = builder_utils.read_gzipped_file(os.path.join(directory, filename))
        identifier = None
        description = []
        lines = []
        missed = 0
        entities = set()
        relationships = defaultdict(set)
        is_first = True
        i = 0
        read_lines = 0
        num_entities = 0
        num_relationships = defaultdict(int)
        try:
            for line in fhandler:
                i += 1
                read_lines += 1
                if line.startswith("# STOCKHOLM"):
                    if identifier is not None:
                        entities.add((identifier, 'Functional_region', name, " ".join(description), "PFam"))
                        if len(entities) == 100:
                            print_files(entities, entity_header,
                                        outputfile=os.path.join(import_directory, 'Functional_region.tsv'),
                                        is_first=is_first)
                            num_entities += len(entities)
                            if 'mentioned_in_publication' in relationships:
                                print_files(relationships['mentioned_in_publication'],
                                            relationship_headers['mentioned_in_publication'],
                                            outputfile=os.path.join(import_directory, 'Functional_region_mentioned_in_publication.tsv'),
                                            is_first=is_first)
                                num_relationships['mentioned_in_publication'] += len(relationships['mentioned_in_publication'])
                            if 'found_in_protein' in relationships:
                                print_files(relationships['found_in_protein'],
                                            relationship_headers['found_in_protein'],
                                            outputfile=os.path.join(import_directory, 'Functional_region_found_in_protein.tsv'),
                                            is_first=is_first,
                                            filter_for=('END_ID', valid_proteins))
                                num_relationships['found_in_protein'] += len(relationships['found_in_protein'])
                            entities = set()
                            relationships = defaultdict(set)
                            is_first = False
                    identifier = None
                    description = []
                elif line.startswith("#=GF"):
                    data = line.rstrip('\r\n').split()
                    if 'AC' in data:
                        identifier = data[2].split('.')[0]
                    elif 'DE' in data:
                        name = " ".join(data[2:])
                    elif 'RM' in data:
                        relationships['mentioned_in_publication'].add((identifier, data[2], "MENTIONED_IN_PUBLICATION", "PFam"))
                    elif 'CC' in data:
                        description.append(" ".join(data[2:]))
                elif not line.startswith('//'):
                    data = line.rstrip('\r\n').split()
                    protein, positions = data[0].split('/')
                    protein = protein.replace('.', '-')
                    start, end = positions.split('-')
                    sequence = data[1]
                    relationships['found_in_protein'].add((identifier, protein, "FOUND_IN_PROTEIN", start, end, sequence, "PFam"))
                    if protein.split('-')[0] != protein:
                        relationships['found_in_protein'].add((identifier, protein.split('-')[0], "FOUND_IN_PROTEIN", start, end, sequence, "PFam"))
        except UnicodeDecodeError:
            lines.append(i)
            missed += 1

        fhandler.close()

        if len(entities) > 0:
            print_files(entities, entity_header,
                        outputfile=os.path.join(import_directory, 'Functional_region.tsv'),
                        is_first=is_first)
            num_entities += len(entities)
            print_files(relationships['mentioned_in_publication'],
                        relationship_headers['mentioned_in_publication'],
                        outputfile=os.path.join(import_directory, 'Functional_region_mentioned_in_publication.tsv'),
                        is_first=is_first)
            num_relationships['mentioned_in_publication'] += len(relationships['mentioned_in_publication'])
            print_files(relationships['found_in_protein'],
                        relationship_headers['found_in_protein'],
                        outputfile=os.path.join(import_directory, 'Functional_region_found_in_protein.tsv'),
                        is_first=is_first,
                        filter_for=('END_ID', valid_proteins))
            num_relationships['found_in_protein'] += len(relationships['found_in_protein'])

        stats.add(builder_utils.buildStats(num_entities, "entity", "Functional_region", "Pfam", 'Functional_region.tsv', updated_on))
        for rel in num_relationships:
            stats.add(builder_utils.buildStats(num_relationships[rel], "relationship", rel.upper(), "Pfam", 'Functional_region_' + rel + '.tsv', updated_on))

    builder_utils.remove_directory(directory)

    return stats
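
# Sketch of the Stockholm records the Pfam parser above consumes (abridged,
# hypothetical accession and values). '#=GF AC' yields the identifier, 'DE' the
# name, 'RM' a publication id, 'CC' the description, and alignment lines such
# as 'P01308.1/25-54 <sequence>' yield found_in_protein relationships.
#
#   # STOCKHOLM 1.0
#   #=GF AC   PF00049.1
#   #=GF DE   Insulin family
#   #=GF RM   8098212
#   #=GF CC   This family includes insulin and related peptides.
#   P01308.1/25-54   MALWMRLLPLLALLALWGPDPAAAFVNQHL
#   //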
def generate_graphFiles(import_directory, ontologies=None, download=True):
    """
    This function parses and extracts data from a given list of ontologies. If no ontologies are provided, \
    all available ontologies are used. Terms, relationships and definitions are saved as .tsv files to be loaded into \
    the graph database.

    :param str import_directory: relative path from current python module to 'imports' directory.
    :param ontologies: list of ontologies to be imported. If None, all available ontologies are imported.
    :type ontologies: list or None
    :param bool download: whether the ontology files are to be downloaded.
    :return: Set of tuples. Each tuple corresponds to a unique label/relationship type, date, time, \
            database, and number of nodes and relationships.
    """
    entities = config["ontologies"]
    if ontologies is not None:
        entities = {}
        for ontology in ontologies:
            ontology = ontology.capitalize()
            if ontology in config["ontologies"]:
                entities.update({ontology: config["ontologies"][ontology]})

    updated_on = "None"
    if download:
        updated_on = str(date.today())

    stats = set()
    for entity in entities:
        ontology = config["ontologies"][entity]
        if ontology in config["ontology_types"]:
            ontologyType = config["ontology_types"][ontology]
        try:
            result, mappings, extra_entities, extra_rels = parse_ontology(ontology, download)
            if result is not None:
                terms, relationships, definitions = result
                for namespace in terms:
                    if namespace in config["entities"]:
                        name = config["entities"][namespace]
                        entity_outputfile = os.path.join(import_directory, name + ".tsv")
                        with open(entity_outputfile, 'w', encoding='utf-8') as csvfile:
                            writer = csv.writer(csvfile, delimiter='\t',
                                                escapechar='\\', quotechar='"',
                                                quoting=csv.QUOTE_ALL)
                            writer.writerow(['ID', ':LABEL', 'name', 'description', 'type', 'synonyms'])
                            num_terms = 0
                            for term in terms[namespace]:
                                writer.writerow([term, entity, list(terms[namespace][term])[0], definitions[term], ontologyType, ",".join(terms[namespace][term])])
                                num_terms += 1
                            for extra_entity in extra_entities:
                                writer.writerow(list(extra_entity))
                                num_terms += 1
                        logger.info("Ontology {} - Number of {} entities: {}".format(ontology, name, num_terms))
                        stats.add(builder_utils.buildStats(num_terms, "entity", name, ontology, entity_outputfile, updated_on))
                        if namespace in relationships:
                            relationships_outputfile = os.path.join(import_directory, name + "_has_parent.tsv")
                            relationships[namespace].update(extra_rels)
                            relationshipsDf = pd.DataFrame(list(relationships[namespace]))
                            relationshipsDf.columns = ['START_ID', 'END_ID', 'TYPE']
                            relationshipsDf.to_csv(path_or_buf=relationships_outputfile,
                                                   sep='\t', header=True, index=False,
                                                   quotechar='"', quoting=csv.QUOTE_ALL,
                                                   line_terminator='\n', escapechar='\\')
                            logger.info("Ontology {} - Number of {} relationships: {}".format(ontology, name + "_has_parent", len(relationships[namespace])))
                            stats.add(builder_utils.buildStats(len(relationships[namespace]), "relationships", name + "_has_parent", ontology, relationships_outputfile, updated_on))
            else:
                logger.warning("Ontology {} - The parsing did not work".format(ontology))
            if mappings is not None:
                for name in mappings:
                    mappings_outputfile = os.path.join(import_directory, name + ".tsv")
                    mappingsDf = pd.DataFrame(list(mappings[name]))
                    mappingsDf.columns = ['START_ID', 'END_ID', 'TYPE']
                    mappingsDf.to_csv(path_or_buf=mappings_outputfile,
                                      sep='\t', header=True, index=False,
                                      quotechar='"', quoting=csv.QUOTE_ALL,
                                      line_terminator='\n', escapechar='\\')
                    logger.info("Ontology {} - Number of {} relationships: {}".format(ontology, name, len(mappings[name])))
                    stats.add(builder_utils.buildStats(len(mappings[name]), "relationships", name, ontology, mappings_outputfile, updated_on))
        except Exception as err:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Error: {}. Ontology {}, file: {}, line: {}".format(err, ontology, fname, exc_tb.tb_lineno))

    return stats
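
# Usage sketch for generate_graphFiles (hypothetical import path and ontology
# names). Passing a subset restricts the import to those configured ontologies;
# None imports all of them. Names must match keys in config["ontologies"].
def _example_generate_ontology_files():
    stats = generate_graphFiles('/data/imports/ontologies',
                                ontologies=['Disease', 'Tissue'], download=False)
    return stats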