Example 1
def parse_fasta(databases_directory,
                config,
                import_directory,
                download=True,
                updated_on=None):
    stats = set()
    url = config['uniprot_fasta_file']
    entities_output_file = os.path.join(import_directory,
                                        "Amino_acid_sequence.tsv")
    rel_output_file = os.path.join(
        import_directory, "Protein_HAS_Sequence_Amino_acid_sequence.tsv")

    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])

    if download:
        builder_utils.downloadDB(url, directory)

    ff = builder_utils.read_gzipped_file(file_name)
    records = builder_utils.parse_fasta(ff)
    num_entities = 0
    with open(entities_output_file, 'w', encoding='utf-8') as ef:
        ef.write('ID\theader\tsequence\tsize\tsource\n')
        with open(rel_output_file, 'w', encoding='utf-8') as rf:
            rf.write('START_ID\tEND_ID\tTYPE\tsource\n')
            for batch in builder_utils.batch_iterator(records, 1000):
                for record in batch:
                    identifier = record.id.split('|')[1]
                    header = record.id
                    sequence = str(record.seq)
                    sequence_len = len(sequence)
                    ef.write(identifier + "\t" + header + '\t' + sequence +
                             '\t' + str(sequence_len) + '\tUniProt\n')
                    rf.write(identifier + '\t' + identifier +
                             '\tHAS_SEQUENCE\tUniProt\n')
                    num_entities += 1

    stats.add(
        builder_utils.buildStats(num_entities, "entity", "Amino_acid_sequence",
                                 "UniProt", entities_output_file, updated_on))
    stats.add(
        builder_utils.buildStats(num_entities, "relationships", "HAS_SEQUENCE",
                                 "UniProt", rel_output_file, updated_on))

    return stats
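
A minimal invocation sketch, assuming the surrounding CKG module (os, builder_utils) is already imported; the URL and the directory paths below are placeholders, with only the 'uniprot_fasta_file' key taken from the code above:

# Hypothetical usage of parse_fasta; paths and URL are placeholders.
config = {'uniprot_fasta_file': 'https://example.org/uniprot_sprot.fasta.gz'}
stats = parse_fasta(databases_directory='/tmp/databases',
                    config=config,
                    import_directory='/tmp/imports',
                    download=False,  # reuse a previously downloaded archive
                    updated_on='2020-01-01')
for stat in stats:
    print(stat)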
Example 2
def generate_graph_files(data, dataType, projectId, stats, ot='w', dataset_import_dir='experiments'):
    if dataType == '':
        outputfile = os.path.join(dataset_import_dir, projectId+".tsv")
    else:
        outputfile = os.path.join(dataset_import_dir, projectId+"_"+dataType.lower()+".tsv")

    with open(outputfile, ot, encoding="utf-8") as f:
        data.to_csv(path_or_buf=f, sep='\t',
                    header=True, index=False, quotechar='"',
                    line_terminator='\n', escapechar='\\')

    logger.info("Experiment {} - Number of {} relationships: {}".format(projectId, dataType, data.shape[0]))
    stats.add(builder_utils.buildStats(data.shape[0], "relationships", dataType, "Experiment", outputfile))
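
A sketch of how this helper might be driven, assuming the module-level logger, builder_utils and os from the surrounding CKG code are in scope; the DataFrame content and the project identifier are illustrative only:

import pandas as pd

# Illustrative relationships table; the column names are an assumption.
data = pd.DataFrame({'START_ID': ['P0000001'], 'END_ID': ['P12345'],
                     'TYPE': ['HAS_QUANTIFIED_PROTEIN']})
stats = set()
generate_graph_files(data, 'proteins', 'P0000001', stats,
                     ot='w', dataset_import_dir='/tmp/experiments')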
Example 3
def print_single_file(data, header, output_file, data_type, data_object,
                      is_first, updated_on):
    stats = set()
    df = pd.DataFrame(list(data), columns=header)
    stats.add(
        builder_utils.buildStats(len(data), data_type, data_object, "UniProt",
                                 output_file, updated_on))
    with open(output_file, 'a', encoding='utf-8') as ef:
        df.to_csv(path_or_buf=ef,
                  sep='\t',
                  header=is_first,
                  index=False,
                  quotechar='"',
                  line_terminator='\n',
                  escapechar='\\')

    return stats
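
A minimal sketch with made-up header and rows; the file is opened in append mode, so is_first=True is what writes the column header on the first call:

# Hypothetical entity rows matching a three-column header.
header = ['ID', 'name', 'source']
rows = [('P12345', 'Protein A', 'UniProt')]
stats = print_single_file(rows, header, '/tmp/imports/Protein.tsv',
                          'entity', 'Protein', is_first=True,
                          updated_on=None)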
Example 4
def print_multiple_relationships_files(data, header, output_dir, is_first,
                                       updated_on):
    stats = set()
    for entity, relationship in data:
        df = pd.DataFrame(list(data[(entity, relationship)]), columns=header)
        output_file = os.path.join(
            output_dir, entity + "_" + relationship.lower() + ".tsv")
        stats.add(
            builder_utils.buildStats(len(data[(entity, relationship)]),
                                     'relationships', relationship, "UniProt",
                                     output_file, updated_on))
        with open(output_file, 'a', encoding='utf-8') as ef:
            df.to_csv(path_or_buf=ef,
                      sep='\t',
                      header=is_first,
                      index=False,
                      quotechar='"',
                      line_terminator='\n',
                      escapechar='\\')

    return stats
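
A usage sketch under the assumption that data maps (entity, relationship) tuples to row sets sharing one header; this hypothetical input would append to Protein_acts_on.tsv in the output directory:

from collections import defaultdict

# Hypothetical input keyed by (entity, relationship) tuples.
data = defaultdict(set)
data[('Protein', 'ACTS_ON')].add(('P12345', 'P67890', 'ACTS_ON', 'UniProt'))
header = ['START_ID', 'END_ID', 'TYPE', 'source']
stats = print_multiple_relationships_files(data, header, '/tmp/imports',
                                           is_first=True, updated_on=None)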
Example 5
def parseDatabase(importDirectory, database, download=True):
    stats = set()
    updated_on = None
    if download:
        updated_on = str(date.today())
    try:
        logger.info("Parsing database {}".format(database))
        database_directory = ckg_config['databases_directory']
        if database.lower() == "jensenlab":
            result = jensenlabParser.parser(database_directory, download)
            for qtype in result:
                relationships, header, outputfileName = result[qtype]
                outputfile = os.path.join(importDirectory, outputfileName)
                builder_utils.write_relationships(relationships, header,
                                                  outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, qtype, len(relationships)))
                stats.add(
                    builder_utils.buildStats(len(relationships),
                                             "relationships", qtype, database,
                                             outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "mentions":
            num_entities, outputfile = textminingParser.parser(
                database_directory, importDirectory, download)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Publication", num_entities))
            stats.add(
                builder_utils.buildStats(num_entities, "entity", "Publication",
                                         database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "hgnc":
            #HGNC
            entities, header = hgncParser.parser(database_directory, download)
            outputfile = os.path.join(importDirectory, "Gene.tsv")
            builder_utils.write_entities(entities, header, outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Gene", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Gene",
                                         database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "refseq":
            entities, relationships, headers = refseqParser.parser(
                database_directory, download)
            for entity in entities:
                header = headers[entity]
                outputfile = os.path.join(importDirectory, entity + ".tsv")
                builder_utils.write_entities(entities[entity], header,
                                             outputfile)
                logger.info("Database {} - Number of {} entities: {}".format(
                    database, entity, len(entities[entity])))
                stats.add(
                    builder_utils.buildStats(len(entities[entity]), "entity",
                                             entity, database, outputfile,
                                             updated_on))
            for rel in relationships:
                header = headers[rel]
                outputfile = os.path.join(importDirectory,
                                          "refseq_" + rel.lower() + ".tsv")
                builder_utils.write_relationships(relationships[rel], header,
                                                  outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, rel, len(relationships[rel])))
                stats.add(
                    builder_utils.buildStats(len(relationships[rel]),
                                             "relationships", rel, database,
                                             outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "uniprot":
            #UniProt
            stats.update(
                uniprotParser.parser(database_directory, importDirectory,
                                     download, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "pfam":
            #Pfam
            stats.update(
                pfamParser.parser(database_directory, importDirectory,
                                  download, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "intact":
            #IntAct
            relationships, header, outputfileName = intactParser.parser(
                database_directory, download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header,
                                              outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(
                database, "curated_interacts_with", len(relationships)))
            stats.add(
                builder_utils.buildStats(len(relationships), "relationships",
                                         "curated_interacts_with", database,
                                         outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "mutationds":
            #MutationDs
            relationships, header, outputfileName = mutationDsParser.parser(
                database_directory, download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header,
                                              outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(
                database, "curated_affects_interaction_with",
                len(relationships)))
            stats.add(
                builder_utils.buildStats(len(relationships), "relationships",
                                         "curated_affects_interaction_with",
                                         database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "string":
            #STRING
            proteinMapping, drugMapping = stringParser.parser(
                database_directory, importDirectory, download=download)
            stringParser.parseActions(database_directory,
                                      importDirectory,
                                      proteinMapping,
                                      drugMapping,
                                      download=download,
                                      db="STRING")
            print("Done Parsing database {}".format(database))
        elif database.lower() == "stitch":
            #STITCH
            proteinMapping, drugMapping = stringParser.parser(
                database_directory,
                importDirectory,
                drug_source=dbconfig["sources"]["Drug"],
                download=download,
                db="STITCH")
            stringParser.parseActions(database_directory,
                                      importDirectory,
                                      proteinMapping,
                                      drugMapping,
                                      download=download,
                                      db="STITCH")
            print("Done Parsing database {}".format(database))
        elif database.lower() == "disgenet":
            #DisGeNet
            relationships, header, outputfileName = disgenetParser.parser(
                database_directory, download)
            for idType in relationships:
                outputfile = os.path.join(importDirectory,
                                          idType + "_" + outputfileName)
                builder_utils.write_relationships(relationships[idType],
                                                  header, outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, idType, len(relationships[idType])))
                stats.add(
                    builder_utils.buildStats(len(relationships[idType]),
                                             "relationships", idType, database,
                                             outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "pathwaycommons":
            #PathwayCommons pathways
            entities, relationships, entities_header, relationships_header = pathwayCommonsParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Pathway.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Pathway",
                                         database, entity_outputfile,
                                         updated_on))
            pathway_outputfile = os.path.join(
                importDirectory,
                "pathwaycommons_protein_associated_with_pathway.tsv")
            builder_utils.write_relationships(relationships,
                                              relationships_header,
                                              pathway_outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(
                database, "protein_associated_with_pathway",
                len(relationships)))
            stats.add(
                builder_utils.buildStats(len(relationships), "relationships",
                                         "protein_associated_with_pathway",
                                         database, pathway_outputfile,
                                         updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "reactome":
            #Reactome
            entities, relationships, entities_header, relationships_header = reactomeParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory,
                                             database.lower() + "_Pathway.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Pathway",
                                         database, entity_outputfile,
                                         updated_on))
            for entity, relationship in relationships:
                reactome_outputfile = os.path.join(
                    importDirectory,
                    database.lower() + "_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)],
                    relationships_header[entity], reactome_outputfile)
                logger.info(
                    "Database {} - Number of {} {} relationships: {}".format(
                        database, entity, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database,
                        reactome_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "smpdb":
            #SMPDB
            entities, relationships, entities_header, relationships_header = smpdbParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory,
                                             database.lower() + "_Pathway.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Pathway",
                                         database, entity_outputfile,
                                         updated_on))
            for entity, relationship in relationships:
                smpdb_outputfile = os.path.join(
                    importDirectory,
                    database.lower() + "_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)],
                    relationships_header[entity], smpdb_outputfile)
                logger.info(
                    "Database {} - Number of {} {} relationships: {}".format(
                        database, entity, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database,
                        smpdb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "dgidb":
            relationships, header, outputfileName = drugGeneInteractionDBParser.parser(
                database_directory, download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header,
                                              outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(
                database, "targets", len(relationships)))
            stats.add(
                builder_utils.buildStats(len(relationships), "relationships",
                                         "targets", database, outputfile,
                                         updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "sider":
            relationships, header, outputfileName, drugMapping, phenotypeMapping = siderParser.parser(
                database_directory, dbconfig["sources"]["Drug"], download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header,
                                              outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(
                database, "has_side_effect", len(relationships)))
            stats.add(
                builder_utils.buildStats(len(relationships), "relationships",
                                         "has_side_effect", database,
                                         outputfile, updated_on))
            relationships, header, outputfileName = siderParser.parserIndications(
                database_directory,
                drugMapping,
                phenotypeMapping,
                download=download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header,
                                              outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(
                database, "indicated_for", len(relationships)))
            stats.add(
                builder_utils.buildStats(len(relationships), "relationships",
                                         "indicated_for", database, outputfile,
                                         updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "oncokb":
            entities, relationships, entities_header, relationships_headers = oncokbParser.parser(
                database_directory, download)
            outputfile = os.path.join(
                importDirectory, "oncokb_Clinically_relevant_variant.tsv")
            builder_utils.write_entities(entities, entities_header, outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Clinically_relevant_variant", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity",
                                         "Clinically_relevant_variant",
                                         database, outputfile, updated_on))
            for relationship in relationships:
                oncokb_outputfile = os.path.join(
                    importDirectory, "oncokb_" + relationship + ".tsv")
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                else:
                    header = ['START_ID', 'END_ID', 'TYPE']
                builder_utils.write_relationships(relationships[relationship],
                                                  header, oncokb_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[relationship])))
                stats.add(
                    builder_utils.buildStats(len(relationships[relationship]),
                                             "relationships", relationship,
                                             database, oncokb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "cancergenomeinterpreter":
            entities, relationships, entities_header, relationships_headers = cancerGenomeInterpreterParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(
                importDirectory, "cgi_Clinically_relevant_variant.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Clinically_relevant_variant", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity",
                                         "Clinically_relevant_variant",
                                         database, entity_outputfile,
                                         updated_on))
            for relationship in relationships:
                cgi_outputfile = os.path.join(importDirectory,
                                              "cgi_" + relationship + ".tsv")
                header = ['START_ID', 'END_ID', 'TYPE']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship],
                                                  header, cgi_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[relationship])))
                stats.add(
                    builder_utils.buildStats(len(relationships[relationship]),
                                             "relationships", relationship,
                                             database, cgi_outputfile,
                                             updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "hmdb":
            entities, relationships, entities_header, relationships_header = hmdbParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Metabolite.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Metabolite", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Metabolite",
                                         database, entity_outputfile,
                                         updated_on))
            for relationship in relationships:
                hmdb_outputfile = os.path.join(importDirectory,
                                               relationship + ".tsv")
                builder_utils.write_relationships(relationships[relationship],
                                                  relationships_header,
                                                  hmdb_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[relationship])))
                stats.add(
                    builder_utils.buildStats(len(relationships[relationship]),
                                             "relationships", relationship,
                                             database, hmdb_outputfile,
                                             updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "drugbank":
            entities, relationships, entities_header, relationships_headers = drugBankParser.parser(
                database_directory)
            entity_outputfile = os.path.join(importDirectory, "Drug.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Drug", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Drug",
                                         database, entity_outputfile,
                                         updated_on))
            for relationship in relationships:
                relationship_outputfile = os.path.join(importDirectory,
                                                       relationship + ".tsv")
                header = ['START_ID', 'END_ID', 'TYPE', 'source']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship],
                                                  header,
                                                  relationship_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[relationship])))
                stats.add(
                    builder_utils.buildStats(len(relationships[relationship]),
                                             "relationships", relationship,
                                             database, relationship_outputfile,
                                             updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "gwascatalog":
            entities, relationships, entities_header, relationships_header = gwasCatalogParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "GWAS_study.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "GWAS_study", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "GWAS_study",
                                         database, entity_outputfile,
                                         updated_on))
            for relationship in relationships:
                header = ['START_ID', 'END_ID', 'TYPE', 'source']
                if relationship in relationships_header:
                    header = relationships_header[relationship]
                outputfile = os.path.join(
                    importDirectory, "GWAS_study_" + relationship + ".tsv")
                builder_utils.write_relationships(relationships[relationship],
                                                  header, outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[relationship])))
                stats.add(
                    builder_utils.buildStats(len(relationships[relationship]),
                                             "relationships", relationship,
                                             database, outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "phosphositeplus":
            entities, relationships, entities_header, relationships_headers = pspParser.parser(
                database_directory)
            entity_outputfile = os.path.join(importDirectory,
                                             "psp_Modified_protein.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Modified_protein", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity",
                                         "Modified_protein", database,
                                         entity_outputfile, updated_on))
            for entity, relationship in relationships:
                rel_header = ["START_ID", "END_ID", "TYPE", "source"]
                if entity in relationships_headers:
                    rel_header = relationships_headers[entity]
                outputfile = os.path.join(
                    importDirectory, "psp_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)], rel_header,
                    outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database, outputfile,
                        updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "signor":
            entities, relationships, entities_header, relationships_headers = signorParser.parser(
                database_directory)
            entity_outputfile = os.path.join(importDirectory,
                                             "signor_Modified_protein.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Modified_protein", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity",
                                         "Modified_protein", database,
                                         entity_outputfile, updated_on))
            for entity, relationship in relationships:
                rel_header = ["START_ID", "END_ID", "TYPE", "source"]
                prefix = 'signor_' + entity.lower()
                if relationship in relationships_headers:
                    rel_header = relationships_headers[relationship]
                if relationship == 'mentioned_in_publication':
                    prefix = entity
                outputfile = os.path.join(
                    importDirectory,
                    prefix + "_" + relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)], rel_header,
                    outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database, outputfile,
                        updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "corum":
            entities, relationships, entities_header, relationships_headers = corumParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Complex.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Complex", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Complex",
                                         database, entity_outputfile,
                                         updated_on))
            for entity, relationship in relationships:
                corum_outputfile = os.path.join(
                    importDirectory,
                    database.lower() + "_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)],
                    relationships_headers[entity], corum_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database,
                        corum_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "foodb":
            entities, relationships, entities_header, relationships_headers = foodbParser.parser(
                database_directory, download)
            entity_outputfile = os.path.join(importDirectory, "Food.tsv")
            builder_utils.write_entities(entities, entities_header,
                                         entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(
                database, "Food", len(entities)))
            stats.add(
                builder_utils.buildStats(len(entities), "entity", "Food",
                                         database, entity_outputfile,
                                         updated_on))
            for entity, relationship in relationships:
                foodb_outputfile = os.path.join(
                    importDirectory,
                    database.lower() + "_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)],
                    relationships_headers[entity], foodb_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database,
                        foodb_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "exposome explorer":
            relationships, header = exposomeParser.parser(
                database_directory, download)
            for entity, relationship in relationships:
                ee_outputfile = os.path.join(
                    importDirectory,
                    database.lower() + "_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)], header[entity],
                    ee_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database, ee_outputfile,
                        updated_on))
            print("Done Parsing database {}".format(database))
        elif database.lower() == "hpa":
            relationships, headers = hpaParser.parser(database_directory,
                                                      download)
            for entity, relationship in relationships:
                hpa_outputfile = os.path.join(
                    importDirectory,
                    database.lower() + "_" + entity.lower() + "_" +
                    relationship.lower() + ".tsv")
                builder_utils.write_relationships(
                    relationships[(entity, relationship)],
                    headers[relationship], hpa_outputfile)
                logger.info(
                    "Database {} - Number of {} relationships: {}".format(
                        database, relationship,
                        len(relationships[(entity, relationship)])))
                stats.add(
                    builder_utils.buildStats(
                        len(relationships[(entity, relationship)]),
                        "relationships", relationship, database,
                        hpa_outputfile, updated_on))
            print("Done Parsing database {}".format(database))
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Database {}: {}, file: {},line: {}".format(
            database, sys.exc_info(), fname, exc_tb.tb_lineno))
    return stats
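
A sketch of a driver loop over this dispatcher; the database names are keys handled above, but the import directory is a placeholder, and ckg_config, dbconfig and all parser modules must already be importable:

# Hypothetical driver; download=False assumes the raw files are already local.
all_stats = set()
for db in ['hgnc', 'uniprot', 'intact']:
    all_stats.update(parseDatabase('/tmp/imports', db, download=False))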
Example 6
def parser(databases_directory,
           import_directory,
           download=True,
           updated_on=None):
    config = builder_utils.get_config(config_name="pfamConfig.yml",
                                      data_type='databases')
    entity_header = config['entity_header']
    relationship_headers = config['relationship_headers']

    directory = os.path.join(databases_directory, 'Pfam')
    builder_utils.checkDirectory(directory)
    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values()))

    ftp_url = config['ftp_url']
    filename = config['full_uniprot_file']
    # url = config['test']

    if not os.path.exists(os.path.join(directory, filename)):
        if download:
            builder_utils.downloadDB(ftp_url + filename, directory)

    stats = set()
    if os.path.exists(os.path.join(directory, filename)):
        fhandler = builder_utils.read_gzipped_file(
            os.path.join(directory, filename))
        identifier = None
        name = None
        description = []
        lines = []
        missed = 0
        entities = set()
        relationships = defaultdict(set)
        is_first = True
        i = 0
        read_lines = 0
        num_entities = 0
        num_relationships = {}
        try:
            for line in fhandler:
                i += 1
                read_lines += 1
                if line.startswith("# STOCKHOLM"):
                    if identifier is not None:
                        entities.add((identifier, 'Functional_region', name,
                                      " ".join(description), "PFam"))
                        if len(entities) == 100:
                            print_files(entities,
                                        entity_header,
                                        outputfile=os.path.join(
                                            import_directory,
                                            'Functional_region.tsv'),
                                        is_first=is_first)
                            num_entities += len(entities)
                            if 'mentioned_in_publication' in relationships:
                                print_files(
                                    relationships['mentioned_in_publication'],
                                    relationship_headers[
                                        'mentioned_in_publication'],
                                    outputfile=os.path.join(
                                        import_directory,
                                        'Functional_region_mentioned_in_publication.tsv'
                                    ),
                                    is_first=is_first)
                                if 'mentioned_in_publication' not in num_relationships:
                                    num_relationships[
                                        'mentioned_in_publication'] = 0
                                num_relationships[
                                    'mentioned_in_publication'] += len(
                                        relationships[
                                            'mentioned_in_publication'])
                            if 'found_in_protein' in relationships:
                                print_files(
                                    relationships['found_in_protein'],
                                    relationship_headers['found_in_protein'],
                                    outputfile=os.path.join(
                                        import_directory,
                                        'Functional_region_found_in_protein.tsv'
                                    ),
                                    is_first=is_first,
                                    filter_for=('END_ID', valid_proteins))
                                if 'found_in_protein' not in num_relationships:
                                    num_relationships['found_in_protein'] = 0
                                num_relationships['found_in_protein'] += len(
                                    relationships['found_in_protein'])
                            entities = set()
                            relationships = defaultdict(set)
                            is_first = False
                        identifier = None
                        description = []
                elif line.startswith("#=GF"):
                    data = line.rstrip('\r\n').split()
                    if 'AC' in data:
                        identifier = data[2].split('.')[0]
                    elif 'DE' in data:
                        name = " ".join(data[2:])
                    elif 'RM' in data:
                        relationships['mentioned_in_publication'].add(
                            (identifier, data[2], "MENTIONED_IN_PUBLICATION",
                             "PFam"))
                    elif 'CC' in data:
                        description.append(" ".join(data[2:]))
                elif not line.startswith('//'):
                    data = line.rstrip('\r\n').split()
                    protein, positions = data[0].split('/')
                    protein = protein.replace('.', '-')
                    start, end = positions.split('-')
                    sequence = data[1]
                    relationships['found_in_protein'].add(
                        (identifier, protein, "FOUND_IN_PROTEIN", start, end,
                         sequence, "PFam"))
                    if protein.split('-')[0] != protein:
                        relationships['found_in_protein'].add(
                            (identifier, protein.split('-')[0],
                             "FOUND_IN_PROTEIN", start, end, sequence, "PFam"))
        except UnicodeDecodeError:
            lines.append(i)
            missed += 1

        fhandler.close()

        if len(entities) > 0:
            print_files(entities,
                        entity_header,
                        outputfile=os.path.join(import_directory,
                                                'Functional_region.tsv'),
                        is_first=is_first)
            num_entities += len(entities)
            print_files(relationships['mentioned_in_publication'],
                        relationship_headers['mentioned_in_publication'],
                        outputfile=os.path.join(
                            import_directory,
                            'Functional_region_mentioned_in_publication.tsv'),
                        is_first=is_first)
            num_relationships['mentioned_in_publication'] = \
                num_relationships.get('mentioned_in_publication', 0) + \
                len(relationships['mentioned_in_publication'])
            print_files(relationships['found_in_protein'],
                        relationship_headers['found_in_protein'],
                        outputfile=os.path.join(
                            import_directory,
                            'Functional_region_found_in_protein.tsv'),
                        is_first=is_first,
                        filter_for=('END_ID', valid_proteins))
            num_relationships['found_in_protein'] = \
                num_relationships.get('found_in_protein', 0) + \
                len(relationships['found_in_protein'])

        stats.add(
            builder_utils.buildStats(num_entities, "entity",
                                     "Functional_region", "Pfam",
                                     'Functional_region.tsv', updated_on))

        for rel in num_relationships:
            stats.add(
                builder_utils.buildStats(num_relationships[rel],
                                         "relationship", rel.upper(), "Pfam",
                                         'Functional_region_' + rel + '.tsv',
                                         updated_on))

    builder_utils.remove_directory(directory)

    return stats
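
A minimal call sketch; note that the function removes the Pfam download directory when it finishes, so download=True is needed on a fresh run. The paths are placeholders, and pfamConfig.yml plus the Protein mapping must be available:

# Hypothetical invocation of the Pfam parser.
stats = parser(databases_directory='/tmp/databases',
               import_directory='/tmp/imports',
               download=True,
               updated_on=None)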
Example 7
def generate_graphFiles(import_directory, ontologies=None, download=True):
    """
    This function parses and extracts data from a given list of ontologies. If no ontologies are provided, \
    all available ontologies are used. Terms, relationships and definitions are saved as .tsv files to be loaded into \
    the graph database.

    :param str import_directory: relative path from current python module to 'imports' directory.
    :param ontologies: list of ontologies to be imported. If None, all available ontologies are imported.
    :type ontologies: list or None
    :param bool download: whether the database is to be downloaded.
    :return: Set of tuples. Each tuple corresponds to a unique label/relationship type, date, time, \
            database, and number of nodes and relationships.
    """
    entities = config["ontologies"]
    if ontologies is not None:
        entities = {}
        for ontology in ontologies:
            ontology = ontology.capitalize()
            if ontology in config["ontologies"]:
                entities.update({ontology: config["ontologies"][ontology]})

    updated_on = "None"
    if download:
        updated_on = str(date.today())

    stats = set()
    for entity in entities:
        ontology = config["ontologies"][entity]
        if ontology in config["ontology_types"]:
            ontologyType = config["ontology_types"][ontology]
        try:
            result, mappings, extra_entities, extra_rels = parse_ontology(
                ontology, download)
            if result is not None:
                terms, relationships, definitions = result
                for namespace in terms:
                    if namespace in config["entities"]:
                        name = config["entities"][namespace]
                        entity_outputfile = os.path.join(
                            import_directory, name + ".tsv")
                        with open(entity_outputfile, 'w',
                                  encoding='utf-8') as csvfile:
                            writer = csv.writer(csvfile,
                                                delimiter='\t',
                                                escapechar='\\',
                                                quotechar='"',
                                                quoting=csv.QUOTE_ALL)
                            writer.writerow([
                                'ID', ':LABEL', 'name', 'description', 'type',
                                'synonyms'
                            ])
                            num_terms = 0
                            for term in terms[namespace]:
                                writer.writerow([
                                    term, entity,
                                    list(terms[namespace][term])[0],
                                    definitions[term], ontologyType,
                                    ",".join(terms[namespace][term])
                                ])
                                num_terms += 1
                            for extra_entity in extra_entities:
                                writer.writerow(list(extra_entity))
                                num_terms += 1
                        logger.info(
                            "Ontology {} - Number of {} entities: {}".format(
                                ontology, name, num_terms))
                        stats.add(
                            builder_utils.buildStats(num_terms, "entity", name,
                                                     ontology,
                                                     entity_outputfile,
                                                     updated_on))
                        if namespace in relationships:
                            relationships_outputfile = os.path.join(
                                import_directory, name + "_has_parent.tsv")
                            relationships[namespace].update(extra_rels)
                            relationshipsDf = pd.DataFrame(
                                list(relationships[namespace]))
                            relationshipsDf.columns = [
                                'START_ID', 'END_ID', 'TYPE'
                            ]
                            relationshipsDf.to_csv(
                                path_or_buf=relationships_outputfile,
                                sep='\t',
                                header=True,
                                index=False,
                                quotechar='"',
                                quoting=csv.QUOTE_ALL,
                                line_terminator='\n',
                                escapechar='\\')
                            logger.info(
                                "Ontology {} - Number of {} relationships: {}".
                                format(ontology, name + "_has_parent",
                                       len(relationships[namespace])))
                            stats.add(
                                builder_utils.buildStats(
                                    len(relationships[namespace]),
                                    "relationships", name + "_has_parent",
                                    ontology, relationships_outputfile,
                                    updated_on))
            else:
                logger.warning(
                    "Ontology {} - The parsing did not work".format(ontology))
            if mappings is not None:
                for name in mappings:
                    mappings_outputfile = os.path.join(import_directory,
                                                       name + ".tsv")
                    mappingsDf = pd.DataFrame(list(mappings[name]))
                    mappingsDf.columns = ['START_ID', 'END_ID', 'TYPE']
                    mappingsDf.to_csv(path_or_buf=mappings_outputfile,
                                      sep='\t',
                                      header=True,
                                      index=False,
                                      quotechar='"',
                                      quoting=csv.QUOTE_ALL,
                                      line_terminator='\n',
                                      escapechar='\\')
                    logger.info(
                        "Ontology {} - Number of {} relationships: {}".format(
                            ontology, name, len(mappings[name])))
                    stats.add(
                        builder_utils.buildStats(len(mappings[name]),
                                                 "relationships", name,
                                                 ontology, mappings_outputfile,
                                                 updated_on))
        except Exception as err:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error(
                "Error: {}. Ontology {}: {}, file: {},line: {}".format(
                    err, ontology, sys.exc_info(), fname, exc_tb.tb_lineno))
    return stats
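
A usage sketch assuming 'disease' and 'tissue' are (placeholder) keys of config["ontologies"] once capitalized; the function capitalizes the names internally, so lowercase input is accepted:

# Hypothetical call restricted to two ontologies.
stats = generate_graphFiles('/tmp/imports',
                            ontologies=['disease', 'tissue'],
                            download=False)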