Example #1
0
def main(logger):
    """Run the ELM prediction helpers and export the resulting network.

    Calls the module's matching helpers over the files found in the DSSP
    directory, then stores one edge for every (ELM protein id, domain protein
    id) pair of each entry in ``ELMmaches``.

    :param logger: accepted for interface compatibility; this function logs
        through the ``logging`` module instead
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    loadUniprotFile(UNIPROT_DATA_FILE)

    dssp_dir = 'PTM/databases/ELMpred/dssp/LAB/'
    for entry in os.listdir(dssp_dir):
        file_list.append(dssp_dir + entry)

    # The loop stops when it reaches the 15000th entry, so at most
    # 14999 files are handed to get_match
    for position, dssp_path in enumerate(file_list, start=1):
        if position == 15000:
            break
        get_match(dssp_path)

    get_scores()
    get_protein_id()
    get_taxid()
    get_domain()
    logging.debug('Done creating elm map. Starting adding to DB structure')

    for match in ELMmaches:
        # Only matches that have ids on both sides can form an edge
        if len(match.domain_prot_id) == 0 or len(match.elm_prot_id) == 0:
            continue

        for elm_prot_id in match.elm_prot_id:
            for domain_prot_id in match.domain_prot_id:
                # Node dicts; an already stored node comes back with its 'id'
                source_dict = insert_or_get_node_dict(
                    elm_prot_id, "Uniprot", match.taxid, node_names_to_id,
                    db_api)
                target_dict = insert_or_get_node_dict(
                    domain_prot_id, "Uniprot", match.taxid, node_names_to_id,
                    db_api)

                # Insert whichever node does not carry an 'id' yet
                for node_dict in (source_dict, target_dict):
                    if 'id' not in node_dict:
                        db_api.insert_node(node_dict)

                # A fresh dict per edge, in case insert_edge modifies it
                edge_dict = dict(
                    publication_ids='pubmed:26615199',
                    layer='2',
                    source_db=DB_TYPE,
                    interaction_identifiers=None,
                    confidence_scores=None,
                    interaction_detection_method=None,
                    interaction_types='MI:0190(interaction type)',
                    first_author=None,
                )

                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the export db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #2
0
def main(logger):
    """Parse the tab-separated DATA_FILE and export its human-human rows.

    Rows whose 4th and 9th columns are both ``'human'`` become a
    phosphorylation edge between the identifiers in the 3rd and 7th columns.

    :param logger: accepted for interface compatibility; not used here
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    human_taxid = 'taxid:9606'
    interaction_types = "|".join(
        ('MI:0217(phosphorylation)', 'is_directed:true', 'is_direct:false'))

    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        # The first four lines are header lines
        for _ in range(4):
            data.readline()

        for raw_line in data:
            fields = raw_line.split('\t')

            # Keep only interactions where both partners are marked human
            if fields[3] != 'human' or fields[8] != 'human':
                continue

            # Node dicts; an already stored node comes back with its 'id'
            source_dict = insert_or_get_node_dict(
                fields[2], "Uniprot", human_taxid, node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                fields[6], "Uniprot", human_taxid, node_names_to_id, db_api)

            # Insert whichever node does not carry an 'id' yet
            for node_dict in (source_dict, target_dict):
                if 'id' not in node_dict:
                    db_api.insert_node(node_dict)

            # A fresh dict per edge, in case insert_edge modifies it
            edge_dict = dict(
                publication_ids='pubmed:22135298',
                layer='2',
                source_db=DB_TYPE,
                interaction_identifiers=None,
                confidence_scores=None,
                interaction_detection_method=None,
                interaction_types=interaction_types,
                first_author=None,
            )

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the export db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #3
0
def main(logger):
    """Parse the tab-separated DATA_FILE into undirected layer-1 edges.

    Every data row links the identifier in its 1st column to the one in its
    3rd column; all nodes are given the taxid ``taxid:9606``.

    :param logger: accepted for interface compatibility; not used here
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    taxid = 'taxid:9606'
    interaction_types = "|".join(
        ('MI:0190(interaction type)', 'is_directed:false', 'is_direct:false'))

    with open(DATA_FILE) as data:
        # The first line is the header
        data.readline()

        for raw_line in data:
            fields = raw_line.split('\t')

            # Node dicts; an already stored node comes back with its 'id'
            source_dict = get_node_a(fields[0], taxid, db_api)
            target_dict = get_node_b(fields[2], taxid, db_api)

            # Insert whichever node does not carry an 'id' yet
            for node_dict in (source_dict, target_dict):
                if 'id' not in node_dict:
                    db_api.insert_node(node_dict)

            # A fresh dict per edge, in case insert_edge modifies it
            edge_dict = dict(
                publication_ids='pubmed:20005715',
                layer='1',
                source_db=DB_TYPE,
                interaction_identifiers=None,
                confidence_scores=None,
                interaction_detection_method=None,
                interaction_types=interaction_types,
                first_author=None,
            )

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the export db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #4
0
def main(logger):
    """Parse the SLK 2.1 tab-separated export and save it as a PSI-MI SQLite db.

    Rows whose source (column 0) or target (column 3) accession is missing
    from the taxid map are skipped. For every remaining row both nodes are
    fetched or created and one edge is inserted.

    :param logger: accepted for interface compatibility; not used here
    :raises KeyError: if an effect / directness value of a row is not present
        in EFFECT_MAP, IS_DIRECT_MAP or IS_DIRECTED_MAP
    """
    UNIPROT_TO_TAX_ID_MAP = get_taxid_map_dict(TAX_ID_MAP_FILE_LOCATION)

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # The context manager guarantees that the input file is closed again,
    # even if a row fails to parse
    with open(SLK_21_FILE_LOCATION, encoding="ISO-8859-1") as slk_file:
        reader = csv.reader(slk_file, delimiter='\t', quotechar='"')
        # Skipping the header (tolerates a completely empty file)
        next(reader, None)

        for row in reader:
            # Both interactors need a known taxid, otherwise the row is dropped
            if (row[0] not in UNIPROT_TO_TAX_ID_MAP) or (row[3] not in UNIPROT_TO_TAX_ID_MAP):
                continue

            mitab_source_pathways = get_mitab_pathways_list_string(row[1])
            mitab_target_pathways = get_mitab_pathways_list_string(row[4])

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = insert_or_get_node_dict(
                UNIPROT_TO_TAX_ID_MAP, row[0], mitab_source_pathways, row[2],
                node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                UNIPROT_TO_TAX_ID_MAP, row[3], mitab_target_pathways, row[5],
                node_names_to_id, db_api)

            effect = EFFECT_MAP[row[8]]

            # The mapped values are reduced to the "true"/"false" flags
            # that are stored in the interaction type string
            if "MI:0407(direct interaction)" in IS_DIRECT_MAP[row[6].lower()]:
                is_direct = "true"
            else:
                is_direct = "false"

            if IS_DIRECTED_MAP[row[7].lower()] == "directed":
                is_directed = "true"
            else:
                is_directed = "false"

            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': get_mitab_publication_list_string(row[9]),
                'interaction_types': "%s|is_directed:%s|is_direct:%s" % (effect, is_directed, is_direct),
                'source_db': 'SLKv2.1',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'layer': "8"
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)
Example #5
0
def main(logger):
    """Parse the miRDeathDB tab-separated file and save it as a PSI-MI SQLite db.

    Only rows whose 10th column is the taxid ``9606`` or ``7227`` are used.
    The miRNA name (column 0) gets the prefix ``hsa-`` for 9606 and ``dme-``
    for 7227 and is linked to the target in column 3.

    :param logger: accepted for interface compatibility; not used here
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # taxid -> prefix that is put in front of the miRNA name
    mirbase_prefix = {'9606': 'hsa-', '7227': 'dme-'}
    interaction_types = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()

        for line in data:
            columns = line.strip().split('\t')
            # Lines that hold a single column carry no interaction
            if len(columns) == 1:
                continue

            tax_id = columns[9]
            if tax_id not in mirbase_prefix:
                continue

            # there are two malformed ID in the database:
            # hsa-miR-149*       -->  hsa-miR-149
            # hsa-"miR-34a,b,c"  -->  hsa-"miR-34"
            mirbase_id = (mirbase_prefix[tax_id] + columns[0]) \
                .replace('*', '').replace('\"', '').replace('a,b,c', '')

            source_dict = insert_or_get_node_dict(
                mirbase_id, 'miRBase', 'taxid:' + tax_id, node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                columns[3], 'GeneCards', 'taxid:' + tax_id, node_names_to_id, db_api)

            pubmed_ids = ['22743998']  # miRDeathDB publication
            row_pubmed_id = columns[8].strip()
            if row_pubmed_id:
                pubmed_ids.append(row_pubmed_id)

            # dict.fromkeys removes duplicates like a set would, but keeps the
            # insertion order, so the joined string is identical on every run
            # (iterating a set of strings depends on hash randomization)
            publication_ids = "|".join(
                dict.fromkeys('pubmed:' + pubmed_id for pubmed_id in pubmed_ids))

            # Inserting edges
            edge_dict = {
                'publication_ids': publication_ids,
                'layer': '5',
                'source_db': 'miRDeathDB',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("miRDeathDB finished")
Example #6
0
def main(logger):
    """Parse the tab-separated DATA_FILE into directed layer-6 edges.

    The source name is the 4th ``|``-separated part of the first column, the
    target is the 3rd column; all nodes are given the taxid ``taxid:9606``.

    :param logger: accepted for interface compatibility; not used here
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    taxid = 'taxid:9606'
    interaction_types = "|".join(
        ('MI:2247(trascriptional regulation)', 'is_directed:true', 'is_direct:false'))
    detection_method = \
        'MI:1176(sequence based prediction of gene regulatory region binding sites)'

    with open(DATA_FILE) as data:
        for raw_line in data:
            fields = raw_line.split('\t')
            lnc_name = fields[0].split('|')[3]

            # Node dicts; an already stored node comes back with its 'id'
            source_dict = insert_or_get_node_dict(
                lnc_name, taxid, node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                fields[2], taxid, node_names_to_id, db_api)

            # A fresh dict per edge, in case insert_edge modifies it
            edge_dict = dict(
                publication_ids='pubmed:28591841|pubmed:29140473',
                layer='6',
                source_db='PSSMprediction',
                interaction_identifiers=None,
                confidence_scores=None,
                interaction_detection_method=detection_method,
                interaction_types=interaction_types,
                first_author=None,
            )

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the export db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #7
0
def main(logger):
    """Parse the comma-separated DATA_FILE and save it as a PSI-MI SQLite db.

    Column use per row: 0/1 gene names, 2/3 UniProt accessions, 4 layer
    description, 5 directness, 6 effect, 7 ``|``-separated PubMed ids,
    8 source database.

    :param logger: accepted for interface compatibility; not used here
    :raises KeyError: if the layer, directness or effect value of a row is not
        one of the values listed in the mapping dicts below
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Open the destination database once before parsing starts. The handle is
    # not needed afterwards, so it is closed right away instead of being leaked.
    sqlite3.connect(DB_DESTINATION).close()

    # The mappings do not depend on the row, so they are built only once
    # Layer mapping
    layer_map_dict = {'Interaction between autophagy proteins': 0}
    # Effect mapping
    effect_map_dict = {
        'unknown': 'unknown',
        "stimulation": 'MI:0624(stimulation)',
    }
    # Directedness mapping
    direct_map_dict = {
        "direct": "MI:0407(directed)",
        "indirect": "MI:2246(indirect)",
    }
    # Source database names that are stored under a different name
    source_map = {
        'BioGRID': 'TheBiogrid',
        'Behrends et Al. 2010': 'Behrends'
    }

    # Parsing file
    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        # Skipping the header
        data.readline()

        for raw_line in data:
            line = raw_line.strip().split(',')

            source_genename = line[0]
            target_genename = line[1]
            source_uniprot = line[2]
            target_uniprot = line[3]

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = get_node_a('Uniprot:' + source_uniprot, 'taxid:9606',
                                     source_genename, db_api)
            target_dict = get_node_b('Uniprot:' + target_uniprot, 'taxid:9606',
                                     target_genename, db_api)

            # Nodes are inserted to the db if they are not in it yet
            if 'id' not in source_dict:
                db_api.insert_node(source_dict)

            if 'id' not in target_dict:
                db_api.insert_node(target_dict)

            # Setting up identifiers
            directness = direct_map_dict[line[5]]
            effect = effect_map_dict[line[6]]
            ident = 'effect:' + effect + '|is_direct:' + directness

            # Publications: every id of the '|' separated list gets the prefix
            pubs = '|pubmed:'.join(line[7].split('|'))

            # Sourcedb: the text before the first '(' without double quotes
            sourcedb = line[8].split('(')[0].replace('"', '')
            source = source_map.get(sourcedb, sourcedb)

            edge_dict = {
                'publication_ids': 'pubmed:' + pubs,
                'layer': layer_map_dict[line[4]],
                'source_db': source,
                'interaction_identifiers': ident,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': None,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)
Example #8
0
def main(logger):
    """Parse the miR2Disease file, resolve PubMed ids and save a PSI-MI SQLite db.

    The work is done in two passes. The first pass reads DATA_FILE, keeps the
    rows with well formed ids and looks up a PubMed id for every reference
    text (column 3) that is not in the JSON cache yet, using the NCBI esearch
    service. The second pass inserts the nodes and edges. Finally the db and
    the updated cache are written to disk.

    :param logger: accepted for interface compatibility; not used here
    :raises requests.RequestException: if an esearch request times out or
        returns an HTTP error status
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    all_data = []
    pubmed_id_map = {}
    node_names_to_id = {}
    cache_path = "./pubmed_id_map_cache_for_miR2Disease.json"

    # Re-use the lookups of an earlier run when a cache file exists
    if os.path.exists(cache_path):
        with open(cache_path) as cache:
            pubmed_id_map = json.load(cache)

    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        # Skipping the three header lines
        for _ in range(3):
            data.readline()
        lines = 0

        for line in data:
            columns = line.split('\t')
            if len(columns) <= 1:
                continue

            lines += 1
            if lines % 50 == 0:
                print("processed lines (miR2Disease): %d" % lines)

            columns[3] = columns[3].strip()

            if not is_well_formed_id(columns[0].strip()) \
                    or not is_well_formed_id(columns[1].strip()):
                print("Warning: malformed ID, link skipped")
                continue

            all_data.append(columns)

            if columns[3] in pubmed_id_map:
                continue

            # Build the search term: "<column 2>[pdat] AND <cleaned reference text>"
            search_term = columns[3].replace(".", ' ').replace(
                ' and ', ' ').replace(' or ', ' ').replace("'", '').strip()
            search_term = "%s[pdat] AND %s" % (columns[2].strip(), search_term)
            search_term = urllib.parse.quote(search_term, safe='')
            URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&report=uilist&retmode=json&term=' + search_term

            # Without a timeout a stalled connection would block the parser
            # forever; an HTTP error is reported as such instead of surfacing
            # later as a confusing JSON / KeyError
            resp = requests.get(URL, timeout=30)
            resp.raise_for_status()
            pubmed_id_list = json.loads(resp.text)['esearchresult']['idlist']

            # Only an unambiguous (single) hit is accepted
            if pubmed_id_list and len(pubmed_id_list) == 1:
                pubmed_id_map[columns[3]] = pubmed_id_list[0]
            else:
                print("WARNING: pmid not found")
                pubmed_id_map[columns[3]] = None

        print("processed lines (miR2Disease): %d" % lines)

    print("saving output db")
    interaction_types = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

    for columns in all_data:

        source_dict = insert_or_get_node_dict(columns[0], 'miRBase',
                                              'taxid:9606', node_names_to_id,
                                              db_api)
        target_dict = insert_or_get_node_dict(columns[1], 'GeneCards',
                                              'taxid:9606', node_names_to_id,
                                              db_api)

        pubmed_ids = ['18927107']  # mir2Disease publication
        found_pubmed_id = pubmed_id_map.get(columns[3])
        if found_pubmed_id:
            pubmed_ids.append(str(found_pubmed_id).strip())

        # dict.fromkeys removes duplicates like a set would, but keeps the
        # insertion order, so the joined string is identical on every run
        # (iterating a set of strings depends on hash randomization)
        publication_ids = "|".join(dict.fromkeys(
            ("pubmed:" + pubmed_id).strip() for pubmed_id in pubmed_ids))

        # Inserting edges
        edge_dict = {
            'publication_ids': publication_ids,
            'layer': '5',
            'source_db': 'miR2Disease',
            'interaction_identifiers': None,
            'confidence_scores': None,
            'interaction_detection_method': None,
            'interaction_types': interaction_types,
            'first_author': None
        }

        db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)

    # save pubmed_id_map, so that we can re-use next time
    with open(cache_path, 'w') as cache:
        json.dump(pubmed_id_map, cache, indent=4, sort_keys=True)
Example #9
0
def main(logger):
    """Parse the NPInter tab-separated file and save it as a PSI-MI SQLite db.

    Only rows whose last column is ``RNA-RNA`` and whose 13th column is a key
    of SPECIES_DICT are used. Rows for which ``fix_id`` returns no source or
    no target id are skipped.

    :param logger: accepted for interface compatibility; not used here
    """
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # Passed to fix_id for every row; it does not depend on the row, so it is
    # built only once
    id_type_map = {
        'NONCODE': {
            'id_type': 'NONCODE',
            'use_name': False
        },
        'miRBase': {
            'id_type': 'miRBase',
            'use_name': True
        },
        'UniProt': {
            'id_type': 'Uniprot',
            'use_name': False
        },
        'UniGene': {
            'id_type': 'GeneCards',
            'use_name': True
        },
        'RefSeq': {
            'id_type': 'RefSeq',
            'use_name': False
        },
    }
    interaction_types = "MI:0407(direct interaction)|is_directed:true|is_direct:true"

    with open(DATA_FILE, encoding='ISO-8859-1') as data:

        lines = 0

        for line in data:
            lines += 1
            if lines % 50000 == 0:
                print("processed lines: %d" % lines)

            columns = line.strip().split('\t')
            if columns[-1] != 'RNA-RNA' or columns[12] not in SPECIES_DICT:
                continue

            species = SPECIES_DICT[columns[12]]
            tax_id = 'taxid:' + species['tax_id']

            # NOTE(review): columns[4] is passed for the source and for the
            # target as well - confirm that the target should not use its
            # own column here
            source_id = fix_id(columns[2].strip(), columns[3].strip(),
                               columns[4].strip(), species, id_type_map)
            target_id = fix_id(columns[6].strip(), columns[7].strip(),
                               columns[4].strip(), species, id_type_map)

            if not source_id or not target_id:
                continue

            source_dict = insert_or_get_node_dict(
                source_id, tax_id, node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                target_id, tax_id, node_names_to_id, db_api)

            pubmed_ids = ['27087310']  # NPInter publication
            pubmed_id = columns[11].strip()
            # Only purely numeric values are accepted as PubMed ids
            if len(pubmed_id) > 0 and re.search("^\\d+$", pubmed_id):
                pubmed_ids.append(pubmed_id)

            # dict.fromkeys removes duplicates like a set would, but keeps the
            # insertion order, so the joined string is identical on every run
            # (iterating a set of strings depends on hash randomization)
            publication_ids = "|".join(
                dict.fromkeys('pubmed:' + item for item in pubmed_ids))

            edge_dict = {
                'publication_ids': publication_ids,
                'layer': '7',
                'source_db': 'NPInter',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)
        print("processed lines: %d" % lines)

    # Saving to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("NPInter finished")
class MolecularIDMapper:
    """Map the nodes of a parsed source db to UniProt / RNACentral accessions.

    The constructor opens the mapping databases and the source database,
    ``main`` writes the mapped nodes and edges into a new PSI-MI SQLite file.
    """

    def __init__(self, db, layer, PROT_DBname, LNCRNAMAP_DBname):
        """
        :param db: name of the parsed source database
        :param layer: name of the layer; for 'lncRNA' and 'miRNA' the RNA
            mapping database is used in addition to the protein one
        :param PROT_DBname: if value is not None, database is created in memory
        :param LNCRNAMAP_DBname: location of the lncRNA / miRNA mapping db,
            only used for the 'lncRNA' and 'miRNA' layers
        :argument DICTIONARY_DB_LOCATION: location of the mapping db, output of create_mapping_db
        :argument SQL_SEED_LOCATION
        :argument SOURCE_DB_LOCATION: location of the parsed source database
        :argument DESTINATION_DB_LOCATION: location where the mapped db will be saved
        """

        # Declaring, and assigning constants
        self.DICTIONARY_DB_LOCATION = PROT_DBname
        self.SQL_SEED_LOCATION = '../../SLKlib/SQLiteDBApi/network-db-seed.sql'
        self.SOURCE_DB_TYPE = db
        self.layer = layer
        # The db we want to map
        self.SOURCE_DB_LOCATION = 'all_output/' + db + '.db'
        # Saving location
        self.DESTINATION_DB_LOCATION = '../../SLKlib/mapper/protein/output/' + db + '_mapped.db'
        # Protein map db
        # NOTE(review): this connect runs before the None checks below, so a
        # None PROT_DBname presumably fails here already - confirm
        self.DICTIONARY_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
        self.DICTIONARY_DB_CURSOR = self.DICTIONARY_DB.cursor()
        # lncRNA map db
        if self.layer == 'lncRNA' or self.layer == 'miRNA':
            self.LNCRNAMAP_DB_LOCATION = LNCRNAMAP_DBname
            self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

        self.PROT_DBname = PROT_DBname
        if self.PROT_DBname is not None:
            # Read database to tempfile
            self.con = sqlite3.connect(self.PROT_DBname)
            tempfile = io.StringIO()
            for line in self.con.iterdump():
                tempfile.write('%s\n' % line)
            self.con.close()
            tempfile.seek(0)

            # Create a database in memory and import from tempfile
            self.PROT_DB = sqlite3.connect(":memory:")
            with self.PROT_DB:
                self.PROT_DB.cursor().executescript(tempfile.read())
                # Indexes for the columns used by the lookup query
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX map_uniprot ON MAPP(uniprot_ac);")
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX uniprotac_id ON UNIPROT_AC(id);")
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX taxid ON SPECIES(tax_id);")
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX map_foreign ON MAPP(foreign_id);")
        else:
            self.PROT_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
            self.PROT_DB.cursor().execute(
                "CREATE INDEX index_name ON mapp (foreign_id);")

        # For lncRNA and miRNA
        if self.layer == 'lncRNA' or self.layer == 'miRNA':
            self.LNCRNAMAP_DBname = LNCRNAMAP_DBname
            if self.LNCRNAMAP_DBname is not None:
                # Read database to tempfile
                self.con = sqlite3.connect(self.LNCRNAMAP_DBname)
                tempfile = io.StringIO()
                for line in self.con.iterdump():
                    tempfile.write('%s\n' % line)
                self.con.close()
                tempfile.seek(0)

                # Create a database in memory and import from tempfile
                self.LNCRNAMAP_DB = sqlite3.connect(":memory:")
                with self.LNCRNAMAP_DB:
                    self.LNCRNAMAP_DB.cursor().executescript(tempfile.read())
                    self.LNCRNAMAP_DB.cursor().execute(
                        "CREATE INDEX index_name ON mapper (orig_ac);")
            else:
                self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

        self.new_db = PsimiSQL(self.SQL_SEED_LOCATION)
        # iterating through the old_db's nodes
        self.source_db = sqlite3.connect(self.SOURCE_DB_LOCATION)
        # Rows can be accessed by column name and converted with dict()
        self.source_db.row_factory = sqlite3.Row
        self.cur = self.source_db.cursor()

    def add_node(self, old_node_id, old_to_new_node_ids_dict, new_name,
                 new_taxid, new_pathways, new_topo, new_db_api):
        """
        Insert a mapped node into the new db and remember its new id.

        :param old_node_id: node id from the source db's node table
        :param old_to_new_node_ids_dict: A dictionary that contains an old node id as key and new node ids as values
        :param new_name: mapped uniprot ac of the mapped node
        :param new_taxid: taxid
        :param new_pathways: pathway
        :param new_topo: topology
        :param new_db_api: A PsimiSQL object
        """

        new_node_dict = {
            "name": new_name,
            "alt_accession": None,  # we don't use it anymore
            "tax_id": new_taxid,
            "pathways": new_pathways,
            "aliases": None,  # we don't use it anymore
            "topology": new_topo
        }

        # inserting the node to the PSI-MI SQLite
        new_db_api.insert_unique_node(new_node_dict)

        # the insert is expected to store the id of the node in the dict
        new_node_id = new_node_dict['id']

        # if the node maps to more than one id this function is called for
        # every insertion; only the first new id is kept for the old node
        if old_node_id not in old_to_new_node_ids_dict:
            old_to_new_node_ids_dict[old_node_id] = new_node_id

    def _lookup_rnacentral_ac(self, orig_ac):
        """Return the first (mapped_ac,) row of the RNA mapping db for orig_ac, or None."""
        cursor = self.LNCRNAMAP_DB.cursor()
        # The id is bound as a parameter, so quotes inside it cannot break
        # or alter the statement
        cursor.execute(
            "SELECT mapped_ac FROM MAPPER WHERE ? = MAPPER.orig_ac "
            "GROUP BY MAPPER.orig_ac",
            (orig_ac,))
        return cursor.fetchone()

    def _lookup_uniprot_ac(self, taxid, foreign_id):
        """Return the first (uniprot_ac, alt_acc) row for a foreign id and taxid, or None.

        The foreign id is lower-cased before it is compared.
        """
        cursor = self.PROT_DB.cursor()
        # Values are bound as parameters, so quotes inside them cannot break
        # or alter the statement
        cursor.execute(
            "SELECT UNIPROT_AC.uniprot_ac, UNIPROT_AC.uniprot_ac_alt_acc FROM UNIPROT_AC "
            "JOIN MAPP ON MAPP.uniprot_ac=UNIPROT_AC.id "
            "JOIN SPECIES ON SPECIES.id=UNIPROT_AC.taxon WHERE SPECIES.tax_id=? "
            "AND MAPP.foreign_id=? GROUP BY MAPP.foreign_id",
            (taxid, foreign_id.lower()))
        return cursor.fetchone()

    def main(self):
        """Map all nodes and edges of the source db and save the mapped db.

        An edge is only copied when both of its nodes could be mapped. The
        saved file is validated with ``slk3_db_validator``; the process exits
        with status 1 if the validation fails.

        :return: tuple of (source db name, number of edges read, number of
            edges that could not be mapped)
        """
        old_node_ids_dict = {}
        invalid_edge_counter = 0
        is_rna_layer = self.layer == 'lncRNA' or self.layer == 'miRNA'

        # MAPPING NODES
        self.cur.execute("SELECT * FROM node")
        for node_row in self.cur:
            # Getting the old information into a dictionary
            old_node_dict = dict(node_row)

            # The id is the part after the first ':' of the node name
            foreign_id = old_node_dict['name'].split(':')[1].strip()
            # Taxid: the part after the first ':' without a '(...)' suffix
            taxid = old_node_dict['tax_id'].split(':')[1].split('(')[0]

            # miRNA and lncRNA mapping: a name may hold several ids
            if is_rna_layer:
                for indiv_id in foreign_id.split(','):
                    indiv_id = indiv_id.replace('"', '').lower()
                    firstrow = self._lookup_rnacentral_ac(indiv_id)
                    if firstrow:
                        self.add_node(node_row['id'], old_node_ids_dict,
                                      'RNACentral:' + firstrow[0],
                                      node_row['tax_id'],
                                      node_row['pathways'],
                                      node_row['topology'], self.new_db)

            # Protein mapping is tried for every layer; everything after the
            # first '.' of the id is ignored for the lookup
            firstrow = self._lookup_uniprot_ac(taxid, foreign_id.split(".")[0])
            if firstrow:
                self.add_node(node_row['id'], old_node_ids_dict,
                              'Uniprot:' + firstrow[0],
                              node_row['tax_id'], node_row['pathways'],
                              node_row['topology'], self.new_db)

        # MAPPING EDGES

        # Since we get the old interactor id's from this query we can simply look up their new id(s) in the old_node_ids dict
        # if both nodes mapped we add them as an edge to the new db

        self.cur.execute("SELECT * from EDGE")
        edge_counter = 0
        for edge_row in self.cur:
            edge_counter += 1
            old_node_id_a = edge_row['interactor_a_node_id']
            old_node_id_b = edge_row['interactor_b_node_id']

            if old_node_id_a not in old_node_ids_dict \
                    or old_node_id_b not in old_node_ids_dict:
                invalid_edge_counter += 1
                continue

            new_node_id_a = old_node_ids_dict[old_node_id_a]
            new_node_id_b = old_node_ids_dict[old_node_id_b]
            new_node_a_dict = self.new_db.get_node_by_id(new_node_id_a)
            new_node_b_dict = self.new_db.get_node_by_id(new_node_id_b)

            # All other columns (including source_db) are taken over unchanged
            new_edge_dict = dict(edge_row)
            new_edge_dict['interactor_a_node_id'] = new_node_id_a
            new_edge_dict['interactor_b_node_id'] = new_node_id_b

            # inserting the new edge
            self.new_db.insert_edge(new_node_a_dict, new_node_b_dict,
                                    new_edge_dict)

        # Saving the mapped database
        self.new_db.save_db_to_file(self.DESTINATION_DB_LOCATION)
        print(
            "\nmapping finished for: %s  total edges: %d (unable to map: %d)\n"
            % (self.SOURCE_DB_TYPE, edge_counter, invalid_edge_counter))

        import slk3_db_validator
        valid = slk3_db_validator.validate_db_file(
            self.DESTINATION_DB_LOCATION)
        if not valid:
            print("ERROR! invalid db file created by the mapper: " +
                  self.DESTINATION_DB_LOCATION)
            sys.exit(1)

        return self.SOURCE_DB_TYPE, edge_counter, invalid_edge_counter
Example #11
0
def main(logger):
    """Parse the semicolon-separated SLK2 export into a PsimiSQL database.

    Every data row yields a source node, a target node and one layer-8 edge
    between them. The first row of the file is treated as a header and
    skipped. The resulting database is written to ``DB_DESTINATION``.

    :param logger: accepted for interface compatibility; not used here.
    :raises KeyError: if a row carries an effect / directness / directedness
        value that is missing from the corresponding lookup map.
    :raises IndexError: if a row has fewer columns than the parser reads, or
        a confidence score entry has no ``name:value`` separator.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    node_names_to_id = {}

    # The context manager guarantees the input file is closed, even when a
    # malformed row aborts the parsing.
    with open(FILE_LOCATION) as input_file:
        rows = csv.reader(input_file, delimiter=';')

        # Skipping the header
        next(rows)

        for row in rows:
            mitab_source_pathways = get_mitab_pathways_list_string(row[5])
            mitab_target_pathways = get_mitab_pathways_list_string(row[11])

            # Creating the node dicts, if the node is already in the db
            # assigning that to the node dict
            source_dict = insert_or_get_node_dict(row[1], row[0],
                                                  mitab_source_pathways,
                                                  row[4].strip(), row[3],
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(row[7], row[6],
                                                  mitab_target_pathways,
                                                  row[10].strip(), row[9],
                                                  node_names_to_id, db_api)

            effect = EFFECT_MAP[row[15]]

            # The maps translate the raw cell values; the edge only stores
            # a "true"/"false" flag for each property.
            direct_term = IS_DIRECT_MAP[row[14].lower()]
            is_direct = ("true" if "MI:0407(direct interaction)" in direct_term
                         else "false")

            directed_term = IS_DIRECTED_MAP[row[13].lower()]
            is_directed = "true" if directed_term == "directed" else "false"

            # Setting up the interaction type
            interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                % (effect, is_directed, is_direct)

            # Confidence scores arrive as "name:value,name:value"; spaces
            # are removed and duplicates dropped while keeping input order.
            new_scores = []
            if row[18] != '':
                for raw_score in row[18].split(","):
                    score_parts = raw_score.split(":")
                    confidence_score_name = score_parts[0].replace(" ", "")
                    confidence_score_value = score_parts[1].replace(" ", "")
                    score = f'SLK2 {confidence_score_name}:{confidence_score_value}'
                    if score not in new_scores:
                        new_scores.append(score)

            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': get_mitab_publication_list_string(row[16]),
                'interaction_types': interaction_types,
                'source_db': "SLKv2.0",
                'interaction_identifiers': None,
                'confidence_scores': "|".join(new_scores),
                'layer': "8"
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the parsed data to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)
Example #12
0
def main(logger):
    """Parse the Signor csv files in ``CSV_LIST`` into a PsimiSQL database.

    Only rows where both interactors are proteins and the first tax id is
    9606 are imported. Each such row produces (or re-uses) two Uniprot nodes
    and one directed layer-8 edge. The pathway assigned to the nodes is
    looked up from the file name in ``FILENAME_TO_PATHWAY_MAP``. The database
    is finally saved to ``DB_DESTINATION``.

    :param logger: accepted for interface compatibility; not used here.
    :raises KeyError: if a file name, effect, mechanism or directness value
        is missing from its lookup map.
    :raises IndexError: if a row has fewer columns than the parser reads.
    """
    # Initiating a PsimiSQL class
    db_api = PsimiSQL(SQL_SEED)

    def get_or_insert_node(node_name, node_taxid, entity_id, pathway):
        """Return the db node called *node_name*, inserting it when missing.

        An already stored node only gets *pathway* appended to its pathway
        list (if not present yet); a new node is created and inserted.
        """
        node_dict = db_api.get_node(node_name, node_taxid)
        if node_dict:
            if pathway not in node_dict['pathways']:
                node_dict['pathways'] += '|' + pathway
                db_api.update_node(node_dict)
            return node_dict

        node_dict = {
            'name': node_name,
            'alt_accession': 'entrez gene/locuslink:' + entity_id,
            'tax_id': node_taxid,
            'pathways': pathway,
            'aliases': None,
            'topology': ""
        }
        db_api.insert_node(node_dict)
        return node_dict

    # Making the script user friendly
    print("Started parsing .csv files")

    # Parsing data files; the counter drives the "(current/total)" progress
    for file_counter, csv_file_location in enumerate(CSV_LIST, start=1):

        csv_file_name = csv_file_location.split('/')[-1]

        sys.stdout.write("Parsing '%s' (%d/%d)\r" %
                         (csv_file_name, file_counter, NUMBER_OF_FILES))

        # The context manager closes the data file once it has been parsed
        with open(csv_file_location, encoding="ISO-8859-1") as csv_handle:
            csv_file = csv.reader(csv_handle, delimiter=';', quotechar='"')

            pathway = FILENAME_TO_PATHWAY_MAP[csv_file_name]

            # NOTE(review): no row is skipped explicitly; a header row, if
            # present, is presumably rejected by the type filter below --
            # confirm against the data files.
            for cells in csv_file:

                type_a = cells[1].lower()
                type_b = cells[5].lower()

                taxids = cells[12].split(';')[0]

                if not (type_a == 'protein' and type_b == 'protein'
                        and taxids == '9606'):
                    continue

                node_taxid = 'taxid:' + taxids

                # Interactor A occupies columns 0 (name), 1 (type), 2 (id)
                node_a_dict = get_or_insert_node(f'Uniprot:{cells[2]}',
                                                 node_taxid, cells[0],
                                                 pathway)

                # Interactor B follows the same layout in columns 4, 5 and
                # 6, so its identifier is read from column 6 (reading
                # column 2 again would turn every edge into a self-loop).
                node_b_dict = get_or_insert_node(f'Uniprot:{cells[6]}',
                                                 node_taxid, cells[4],
                                                 pathway)

                # Getting publication id; the second entry is added to
                # every edge
                publication_id = ['pubmed:' + cells[21], "pubmed:26467481"]

                effect = EFFECT_MAP[cells[8]]

                molecular_background = MOLECULAR_MAP[cells[9]]

                inttype_final = effect + '|' + molecular_background

                direct_term = IS_DIRECT_MAP[cells[22]].strip()
                is_direct = ("true"
                             if "MI:0407(direct interaction)" in direct_term
                             else "false")

                # Setting up the interaction type
                interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                    % (inttype_final, "true", is_direct)

                edge_dict = {
                    'interaction_detection_method': None,
                    'first_author': None,
                    'publication_ids': "|".join(publication_id),
                    'interaction_types': interaction_types,
                    'source_db': 'Signor',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'layer': "8"
                }

                db_api.insert_edge(node_a_dict, node_b_dict, edge_dict)

    print("Parsing files finished!")
    print("Finished parsing Signor. Saving db to %s.db" % (DB_TYPE))
    db_api.save_db_to_file(DB_DESTINATION)
Example #13
0
def main(logger):
    """Load the semicolon-separated SLK2 data file into a PsimiSQL database.

    The first line of ``DATA_FILE`` is discarded as a header. Every further
    line gives a source node, a target node and a layer-1 edge; all nodes
    get the tax id ``taxid:9606``. The database is saved to
    ``EXPORT_DB_LOCATION``.

    :param logger: accepted for interface compatibility; not used here.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    taxid = 'taxid:9606'

    # Effect cell value -> MI term; any other value produces no effect term
    effect_terms = {
        'stimulation': 'MI:0624(stimulation)',
        'inhibition': 'MI:0623(inhibition)',
    }

    with open(DATA_FILE) as data:

        # Skipping the header
        data.readline()

        for line in data:
            columns = line.strip().split(';')

            # Node dicts: an already stored node comes back with an 'id'
            source_pathways = '|'.join(
                columns[4].replace('d ', 'd').split(','))
            source_dict = get_node_a(columns[1], taxid, source_pathways,
                                     db_api)
            target_pathways = '|'.join(
                columns[10].replace('d ', 'd').split(','))
            target_dict = get_node_b(columns[7], taxid, target_pathways,
                                     db_api)

            # Only nodes without an 'id' still have to be inserted
            for node_dict in (source_dict, target_dict):
                if 'id' not in node_dict:
                    db_api.insert_node(node_dict)

            # Pubmed references: every '|'-separated id gets the prefix
            pub_id = '|pubmed:'.join(columns[16].split('|'))

            # Directness and directedness flags
            isdirect = 'true' if columns[14] == 'direct' else 'false'
            isdirected = 'true' if columns[13] == 'PPI directed' else 'false'

            # Effect term is prepended only for the two known effects
            effect = effect_terms.get(columns[15])
            if effect is None:
                interaction_types = "is_directed:%s|is_direct:%s" \
                                    % (isdirected, isdirect)
            else:
                interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                    % (effect, isdirected, isdirect)

            edge_dict = {
                'publication_ids': 'pubmed:' + pub_id,
                'layer': '1',
                'source_db': 'SLKv2.0',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Writing the collected data to the export db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #14
0
def main(logger):
    """Parse the BioGrid MITAB files in ``RAW_FILE_LIST`` into one database.

    For every non-empty, non-comment line whose two tax ids are both in the
    accepted list, two BioGrid nodes and one undirected layer-3 edge are
    stored. The database is saved to ``EXPORT_DB_LOCATION``.

    :param logger: accepted for interface compatibility; not used here.
    :raises IndexError: if a line has fewer columns than the parser reads or
        an interactor has no ``biogrid:`` alternative identifier.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    accepted_tax_ids = ('taxid:9606', 'taxid:7227', 'taxid:6239',
                        'taxid:7955')

    # interaction types in biogrid:
    # direct:
    #    - psi-mi:"MI:0407"(direct interaction)
    #    - psi-mi:"MI:0915"(physical association)
    #    - psi-mi:"MI:0914"(association)
    # indirect:
    #    - psi-mi:"MI:0799"(additive genetic interaction defined by inequality)
    #    - psi-mi:"MI:0403"(colocalization)
    #    - psi-mi:"MI:0796"(suppressive genetic interaction defined by inequality)
    #    - psi-mi:"MI:0794"(synthetic genetic interaction defined by inequality)
    indirect_mi_numbers = ("0799", "0403", "0796", "0794")
    known_mi_numbers = ("0407", "0915", "0914") + indirect_mi_numbers

    numeric_id = re.compile(r"^\d+$")

    # All files are written into the same db, so the name -> id cache is
    # shared between them instead of being emptied for every file.
    # NOTE(review): assumes insert_or_get_node_dict uses this dict as a cache
    # of already inserted nodes -- confirm.
    node_names_to_id = {}

    def first_biogrid_id(alt_ids_cell):
        """Return the first ``biogrid:`` identifier of the cell, prefix removed."""
        biogrid_ids = [
            alt_id for alt_id in alt_ids_cell.split("|")
            if alt_id.strip().lower().startswith("biogrid:")
        ]
        return biogrid_ids[0][8:]

    for raw_file in RAW_FILE_LIST:
        with open(raw_file) as data:
            # Skipping the header
            data.readline()
            lines = 0

            for line in data:
                stripped_line = line.strip()
                if stripped_line == '' or stripped_line.startswith("#"):
                    continue

                lines += 1
                if lines % 50000 == 0:
                    print("processed lines (biogrid): %d" % lines)

                # if a cell contains only '-', we replace it with empty string
                columns = [
                    "" if cell.strip() == "-" else cell
                    for cell in line.split('\t')
                ]

                tax_id_a = columns[9].strip().lower()
                tax_id_b = columns[10].strip().lower()

                if (tax_id_a not in accepted_tax_ids
                        or tax_id_b not in accepted_tax_ids):
                    continue

                biogrid_id_a = first_biogrid_id(columns[2])
                biogrid_id_b = first_biogrid_id(columns[3])

                # Creating the node dicts, if the node is already in the db
                # assigning that to the node dict
                source_dict = insert_or_get_node_dict(
                    biogrid_id_a, 'BioGrid', tax_id_a, node_names_to_id,
                    db_api)
                target_dict = insert_or_get_node_dict(
                    biogrid_id_b, 'BioGrid', tax_id_b, node_names_to_id,
                    db_api)

                # The four digits of the MI term are cut out by position
                # from a cell that looks like: psi-mi:"MI:0407"(...)
                mi_number = columns[11][11:15]
                if mi_number not in known_mi_numbers:
                    print("warning: unknown interaction type: " +
                          columns[11])
                is_direct = mi_number not in indirect_mi_numbers

                # we add the MI term to the interaction_types
                # but we skip MI:0407(direct interaction) -> this info is
                # already presented in the is_direct attribute
                output_mi_string = "|" + columns[11].replace(
                    "psi-mi:", "").replace("\"", "")
                if "MI:0407" in output_mi_string:
                    output_mi_string = ""

                interaction_types = "is_directed:false|is_direct:%s%s" % (
                    str(is_direct).lower(), output_mi_string)

                # Interaction detection methods: psi-mi:"MI:0018"(two hybrid)
                detection_methods = [
                    (method[7:] if method.lower().startswith('psi-mi')
                     else method).replace("\"", "")
                    for method in columns[6].split("|")
                ]

                # pubmed ids: pubmed:10747089
                # Only purely numeric ids are kept; the set removes
                # duplicates.
                pubmed_ids = set()
                for raw_id in columns[8].split("|"):
                    if raw_id.lower().startswith('pubmed'):
                        raw_id = raw_id[7:]
                    if numeric_id.search(raw_id):
                        pubmed_ids.add(raw_id)
                pubmed_ids.add("30476227")  # latest biogrid publication

                # Sorting makes the stored string reproducible between
                # runs; iterating a set of strings has no stable order.
                publication_ids = "|".join(
                    "pubmed:" + pubmed_id for pubmed_id in sorted(pubmed_ids))

                edge_dict = {
                    'publication_ids': publication_ids,
                    'layer': '3',
                    'source_db': 'TheBiogrid',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method':
                    "|".join(detection_methods),
                    'interaction_types': interaction_types,
                    'first_author': None
                }

                db_api.insert_edge(source_dict, target_dict, edge_dict)

            print("processed lines: %d" % lines)

    # Saving the parsed data to the export db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #15
0
def main(logger):
    """Parse the tab-separated miRecords file into a PsimiSQL database.

    Lines whose two species cells (columns 1 and 5) are both keys of
    ``SPECIES_DICT`` produce a miRBase source node, a RefSeq target node and
    one directed layer-5 edge. Lines with a miRBase id that cannot be
    repaired are skipped with a warning. The database is saved to
    ``DB_DESTINATION``.

    :param logger: accepted for interface compatibility; not used here.
    :raises IndexError: if a line has fewer columns than the parser reads.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    accepted_id_prefixes = ("hsa-", "dme-", "cel-", "dre-")

    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()
        node_names_to_id = {}

        for line in data:
            columns = line.split('\t')

            if columns[1] not in SPECIES_DICT or columns[5] not in SPECIES_DICT:
                continue

            rna_species = SPECIES_DICT[columns[5]]
            target_species = SPECIES_DICT[columns[1]]

            # there are a few kinds of malformed miRbase IDs, like:
            #   [has-let-7a3b]    -->   hsa-let-7a3b
            #   hsa-miR-34b*      -->   hsa-miR-34b
            #   miR-143           -->   <id_prefix>-miR-143
            #
            rna_id = columns[6]
            rna_id = rna_id.replace("[", "").replace("]", "").replace(
                "has-", "hsa-")

            if not rna_id.startswith(accepted_id_prefixes):
                if rna_id.startswith("miR-"):
                    # Missing species prefix: take it from the species entry
                    rna_id = rna_species['id_prefix'] + "-" + rna_id
                else:
                    print(
                        "WARNING: skipping interaction due to malformed miRBase ID: "
                        + rna_id)
                    continue
            rna_id = rna_id.replace("*", "")

            source_dict = insert_or_get_node_dict(
                rna_id, 'miRBase', 'taxid:' + rna_species['tax_id'],
                node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                columns[3], 'RefSeq', 'taxid:' + target_species['tax_id'],
                node_names_to_id, db_api)

            interaction_types = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

            # pubmed id example: 15105502.0
            pubmed_id = columns[0].split('.')[0].strip()
            pubmed_ids = {'18996891'}  # miRecords publication
            if len(pubmed_id) > 0:
                pubmed_ids.add(pubmed_id)

            # Sorting makes the stored string reproducible between runs;
            # iterating a set of strings has no stable order.
            publication_ids = "|".join(
                'pubmed:' + entry for entry in sorted(pubmed_ids))

            edge_dict = {
                'publication_ids': publication_ids,
                'layer': '5',
                'source_db': 'miRecords',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the parsed data to the destination db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("miRecords finished")
Example #16
0
def main(logger):
    """Parse the SLK3 tsv files in ``TSV_LIST`` into a PsimiSQL database.

    The first row of each file is skipped as a header. Every further row
    inserts a source node, a target node and one directed layer-8 edge.
    The database is saved to ``DESTINATION``.

    :param logger: accepted for interface compatibility; not used here.
    :raises KeyError: if a pathway, organism, effect, mechanism or directness
        value is missing from its lookup map.
    :raises IndexError: if a row has fewer columns than the parser reads.
    """
    db_api = PsimiSQL(SQL_SEED)

    # Known misspellings of topology names and their corrected form
    topology_typos = {"Meditor": "Mediator"}

    def build_node(organism, gene_symbol, accession, raw_pathways,
                   raw_topologies):
        """Build the node dict of one interactor from its raw cell values."""
        pathways = []
        for pathway in get_mitab_pathways_list(raw_pathways):
            # Spaces and double quotes are both removed before the lookup
            # (a name containing both used to keep its quotes).
            pathway_key = pathway.replace(" ", "").replace('"', "")
            pathways.append(accepted_pathways[pathway_key])

        # Duplicates are removed and known typos corrected; sorting keeps
        # the stored string reproducible between runs.
        topologies = set()
        for topology in raw_topologies.split(","):
            topology = topology.strip()
            topologies.add(topology_typos.get(topology, topology))

        return {
            "name": "Uniprot:" + accession.replace(" ", ""),
            "alt_accession": "gene symbol:" + gene_symbol,
            "tax_id": ORGANISM_NAME_TO_MITAB_ID_MAP[organism],
            "aliases": '-',
            "pathways": "|".join(pathways),
            "topology": "|".join(sorted(topologies))
        }

    # looping through SLK3 files
    for slk_3_file_location in TSV_LIST:

        # opening each slk file and looping through it; the context manager
        # closes the file once it has been parsed
        with open(slk_3_file_location, encoding="ISO-8859-1") as slk_3_handle:
            slk_3_file = csv.reader(slk_3_handle,
                                    delimiter='\t',
                                    quotechar='"')
            next(slk_3_file)  # Skipping the header

            for line in slk_3_file:

                # Both interactors take their organism from column 0
                source_dict = build_node(line[0], line[2], line[3], line[4],
                                         line[5])
                db_api.insert_node(source_dict)

                target_dict = build_node(line[0], line[7], line[8], line[9],
                                         line[10])
                db_api.insert_node(target_dict)

                effect = EFFECT_MAP[line[14].lower()]

                molecular_background = MOLECULAR_MAP[line[13].lower()]

                inttype_final = effect + '|' + molecular_background

                direct_term = IS_DIRECT_MAP[line[12].lower()]
                is_direct = ("true"
                             if "MI:0407(direct interaction)" in direct_term
                             else "false")

                interaction_types = "%s|is_directed:%s|is_direct:%s" % (
                    inttype_final, "true", is_direct)

                edge_dict = {
                    'interaction_detection_method': None,
                    'first_author': None,
                    'publication_ids': 'pubmed:' + line[15],
                    'interaction_types': interaction_types,
                    'source_db': 'SLKv3.0',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'layer': "8"
                }

                db_api.insert_edge(source_dict, target_dict, edge_dict)

    db_api.save_db_to_file(DESTINATION)
Example #17
0
def main(logger):
    """Build a PsimiSQL database from the curated interaction file.

    The work is done in three reads of ``CURATED_PROTEIN_LIST_FILE_LOCATION``:

    1. the first cell of every line is sorted into a valid or a not valid
       node set, depending on the number of cells in the line;
    2. one edge record (publications and interaction type) is collected per
       ``gene_a@gene_b`` pair whose two genes are valid;
    3. nodes and edges are inserted into the database, every edge at most
       once.

    The database is saved to ``EXPORT_DB_LOCATION``.

    NOTE(review): passes 2 and 3 open the same path as pass 1 -- confirm that
    the interaction data really lives in the curated protein list file.

    :param logger: accepted for interface compatibility; not used here.
    """
    not_valid_node_set = set()
    valid_node_set = set()

    # A line with more than four tab-separated cells marks its first cell as
    # not valid; every other line marks it as valid.
    with open(CURATED_PROTEIN_LIST_FILE_LOCATION) as curated_protein_list_file:
        for line in curated_protein_list_file:
            cells = line.strip().split('\t')
            if len(cells) > 4:
                not_valid_node_set.add(cells[0])
            else:
                # Collecting protein nodes
                valid_node_set.add(cells[0])

    # Collecting pathways from the pathway file. The file is closed right
    # after parsing; the returned object is re-used for every lookup below.
    # NOTE(review): assumes get_pathways reads the file completely before
    # returning -- confirm.
    with open(PATHWAY_FILE_LOCATION) as pathway_file:
        pathways = get_pathways(pathway_file)

    # Initialising a PsimiSQL object
    parser = PsimiSQL(SQL_SEED)

    # Unique node dicts (filled while the node table is populated) and the
    # edge records collected in the second read
    nodes = {}
    edges = {}

    def get_or_insert_node(gene, pathway_list):
        """Return the node dict of *gene*, inserting it on first use."""
        if gene in nodes:
            return nodes[gene]
        node = nodes[gene] = {
            'name': 'HGNC:' + gene,
            'alt_accession': 'HGNC:' + gene,
            'tax_id': 'taxid:9606',
            'pathways': '|'.join(pathway_list),
            'aliases': None
        }
        parser.insert_node(node)
        return node

    with open(CURATED_PROTEIN_LIST_FILE_LOCATION) as ppi_file:

        ppi_file.readline()

        # Although this is a SIF formatted file, it only contains two
        # interactors in a line (a SIF file can contain more than two)
        for line in ppi_file:
            cells = line.strip().split('\t')

            # Lines without all four cells are ignored
            if len(cells) < 4:
                continue
            gene_a, inttype, gene_b, pubmed_ids = cells[:4]

            if gene_a not in valid_node_set or gene_b not in valid_node_set:
                continue

            if not pubmed_ids:
                continue

            edge_id = gene_a + '@' + gene_b

            # Only the first occurrence of an edge is recorded
            if edge_id in edges:
                continue

            pubmed_list = pubmed_ids.split(';')
            pubmed_list.append("26192618")
            if 'N/A' in pubmed_list:
                pubmed_list.remove('N/A')

            # NOTE(review): only the first ';'-separated interaction type
            # decides the effect; later ones never had any influence in the
            # previous implementation either -- confirm this is intended.
            first_inttype = inttype.lower().split(';')[0]
            if 'association' in first_inttype:
                selected_type = 'MI:0914(association)'
            else:
                selected_type = 'MI:0190(interaction type)'

            edges[edge_id] = {
                'inserted': False,
                'is_complex': None,
                'pubmed': 'pubmed:' + '|pubmed:'.join(pubmed_list),
                'effect': selected_type
            }

    not_accepted_characters = (" ", "?", "~", ",")

    with open(CURATED_PROTEIN_LIST_FILE_LOCATION) as ppi_file:

        ppi_file.readline()

        for line in ppi_file:
            cells = line.split('\t')

            # Lines without a third cell are ignored
            if len(cells) < 3:
                continue
            gene_a = cells[0]
            gene_b = cells[2]

            # Gene names containing any of these characters are rejected
            if any(char in gene_a for char in not_accepted_characters):
                continue
            if any(char in gene_b for char in not_accepted_characters):
                continue

            if gene_a not in valid_node_set or gene_b not in valid_node_set:
                continue

            edge_id = gene_a + '@' + gene_b

            edge = edges.get(edge_id)
            if edge is None:
                continue
            if (edge['is_complex'] is True or edge['inserted'] is True
                    or "Reference" in edge['effect']
                    or "neighbor-of" in edge['effect']):
                continue

            pubmed_ids = edge['pubmed']
            effect = edge['effect']

            # creating and inserting edges to the db
            gene_a_pathway_list = get_pathway_list(gene_a.replace('*', ''),
                                                   pathways)

            gene_b_pathway_list = get_pathway_list(gene_b.replace('*', ''),
                                                   pathways)

            # A node that is in the not valid set is not inserted and its
            # edge is dropped. The first node is handled before the second
            # one is checked, so it may be inserted without the edge.
            if gene_a in not_valid_node_set:
                continue
            interactor_a = get_or_insert_node(gene_a.replace('*', ''),
                                              gene_a_pathway_list)

            if gene_b in not_valid_node_set:
                continue
            interactor_b = get_or_insert_node(gene_b.replace('*', ''),
                                              gene_b_pathway_list)

            interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                % (effect, "true", "false")

            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': pubmed_ids,
                'interaction_types': interaction_types,
                'source_db': DB_TYPE,
                'interaction_identifiers': None,
                'confidence_scores': None,
                'layer': "8"
            }

            parser.insert_edge(interactor_a, interactor_b, edge_dict)

            edge['inserted'] = True

    parser.save_db_to_file(EXPORT_DB_LOCATION)
Example #18
0
def _tarbase_detection_method_map():
    """Return the lookup table from TarBase detection-method strings to PSI-MI terms.

    Keys are the raw, unnormalised strings as they appear in the input
    (including typos, stray whitespace and fragments produced by splitting
    on '//' and ';'), so they must not be cleaned up. Values are one or more
    PSI-MI terms joined with '|'.
    """
    experimental = 'MI:0045(experimental interaction detection)'
    qrt_pcr = 'MI:1196(quantitative reverse transcription pcr)'
    luciferase = 'MI:2285(miRNA interference luciferase reporter assay)'
    western = 'MI:0113(western blot)'
    northern = 'MI:0929(northern blot)'
    array = 'MI:0008(array technology)'
    ihc = 'MI:1198(immunohistochemistry)'
    icc = 'MI:1200(immunocytochemistry)'
    chip = 'MI:0402(chromatin immunoprecipitation assay)'
    clip = 'MI:2191(clip)'
    sequencing = 'MI:0078(nucleotide sequence identification)'
    rip = 'MI:1017(rna immunoprecipitation)'
    clash = 'MI:2195(clash)'
    mass_spec = 'MI:0943(detection by mass spectrometry)'
    methylation = 'MI:1189(methylation interference assay)'
    immunostaining = 'MI:0422(immunostaining)'
    zymography = 'MI:0512(zymography)'
    electrophoresis = 'MI:0982(electrophoretic mobility-based method)'
    elisa = 'MI:0411(enzyme linked immunosorbent assay)'

    return {
        'qRT-PCR': qrt_pcr,
        'Luciferase reporter assay': luciferase,
        'Western blot': western,
        'GFP reporter assay': experimental,
        'In situ hybridization': experimental,
        'Northern blot': northern,
        'Reporter assay': experimental,
        'Other': experimental,
        'Microarray': array,
        'Immunohistochemistry': ihc,
        'Immunocytochemistry': icc,
        'Immunoblot': experimental,
        '5RACE': experimental,
        'phenotypic sensor assay': experimental,
        'real-time RT-PCR': qrt_pcr,
        'in situ hybridization': experimental,
        'FACS': experimental,
        'ELISA': experimental,
        'Flow': experimental,
        'ChIP-seq': chip,
        'Immunofluorescence': experimental,
        'GFP Reporter Assay': experimental,
        'HITS-CLIP': clip,
        'PAR-CLIP': clip,
        'intrarenal expression': experimental,
        'Proteomics': experimental,
        'ChIP immunoprecipitation': chip,
        'Luciferase assay': luciferase,
        'QRTPCR': qrt_pcr,
        'Next Generation Sequencing (NGS)': sequencing,
        'RNA-binding protein immunoprecipitation': rip,
        'immunohistochemistry': ihc,
        'Sequencing': sequencing,
        'CLASH': clash,
        'immunoprecipitaion': rip,
        'Quantitative proteomic approach': experimental,
        'ChIP': chip,
        'TRAP': experimental,
        'Immunoprecipitaion': rip,
        'LacZ reporter assay': experimental,
        'flow': experimental,
        'EMSA': experimental,
        'Communoprecipitaion': rip,
        'pSILAC': experimental,
        'RTPCR': qrt_pcr,
        'proteomics analysis': experimental,
        'immunoblot': experimental,
        'ASO assay': experimental,
        'semi-qRT-PCR': qrt_pcr,
        'mice xenograft': experimental,
        'Chip': chip,
        'Flow cytometry': experimental,
        'Immuohistochemistry': experimental,
        'Chromatin immunoprecipitation': chip,
        'microarray': array,
        'Western blotting': western,
        'TaqMan miRNA assay/RT-PCR': experimental + '|' + qrt_pcr,
        'TaqMan miRNA assay': experimental,
        'QRTPCRWestern blot': qrt_pcr + '|' + western,
        'Gluc assay': experimental,
        'Real time PCR': experimental,
        "3'LIFE": experimental,
        'Annexin V-FITC': experimental,
        "5\\'RACE": experimental,
        'Real time RT-PCR': qrt_pcr,
        'Luciferase assay/RT-PCR': luciferase + '|' + qrt_pcr,
        'Westren blot': western,
        '2DGE': experimental,
        'Mass spectrometry': mass_spec,
        'EGFP reporter assay': experimental,
        ' Western blot': western,
        'AGO2 binding RNA immunoprecipitation qRT-PCR': qrt_pcr,
        'B-globin reporter assay': experimental,
        'RISC-IP': rip,
        'Western Blotting': western,
        'Immunoprecipitation': rip,
        'GFP reporter': experimental,
        'pMIR-REPORT': experimental,
        'LacZ assay': experimental,
        "5'RACE": experimental,
        'Western blog': western,
        'Western blo': western,
        'western blot': western,
        'Reverse-phase protein array': array,
        'Western Blot': western,
        'MTT assay': experimental,
        'Immunofluorescence staining': experimental,
        'Immunoblotting': experimental,
        'SILAC (Stable Isotope Labeling of Amino acids in Culture)': experimental,
        'Western blot, luciferase assay': western + '|' + luciferase,
        'DNA methylation analysis': methylation,
        'Wetsern blot': western,
        'Immunohistochemistry analysis': ihc,
        'ChIP-PCR': chip,
        'luciferase reporter assays': luciferase,
        'PCR array': array,
        'Western': western,
        'immunostaining': immunostaining,
        'Caspase-Glo® 3/7 assay': experimental,
        'Cell proliferation assay': experimental,
        'safranin o staining/GAGs contents assay': experimental,
        'wound healing assays': experimental,
        'transwell insert': experimental,
        'anoikis assay': experimental,
        'Gluc reporter assay': experimental,
        'GUS reporter assay': experimental,
        'Zymography': zymography,
        'Motility assay': experimental,
        'CAM assay': experimental,
        'Colony formation assay': experimental,
        'Alizarin red S staining': experimental,
        'mRNA decay': experimental,
        'Cell proliferation': experimental,
        'apoptosis': experimental,
        'cell cycle assays': experimental,
        'colony formation': experimental,
        'Immunoflourescence': experimental,
        'Micorarray': array,
        'Westren Blot': western,
        'Luciferase reporter assay/Western blot': luciferase + '|' + western,
        'Immunohistochemical (IHC) staining': ihc,
        'Luciferase reporter assay/qRT-PCR': luciferase + '|' + qrt_pcr,
        '5"RACE': experimental,
        'Immunofluorescence analysis': experimental,
        'luciferase reporter assay': luciferase,
        'Wstern blot': western,
        'Coimmunoprecipitation': rip,
        'Immunofluorescence microscopy': experimental,
        '/Western blot': western,
        'Luciferase reporter assay/QRTPCR': luciferase + '|' + qrt_pcr,
        'MTT': experimental,
        'immunofluorescence assays': experimental,
        'qRT_PCR': qrt_pcr,
        '2-D Gel Electrophoresis (2DGE)': electrophoresis,
        'RISC analysis': experimental,
        'silico analysis': experimental,
        'Microarray/In situ hybridization': array,
        'Western blot ': western,
        'Genotyping': experimental,
        'Weastern blot': western,
        'YFP expression': experimental,
        # The next three keys are fragments of one free-text sentence that the
        # '//' and ';' splitting in main() breaks apart.
        'To test if miR-141 directly targets the PR transcript, we analyzed four predicted miR-141-binding sites (Figure 4c)': experimental,
        ' three within the 3′ untranslated region (UTR) as identified through Targetscan (http:': experimental,
        'www.targetscan.org/) and one in the la': experimental,
        'qRT-PCR/Luciferase reporter assay': qrt_pcr,
        'Luciferase reporter assay and western blot': luciferase + '|' + western,
        'TOPflash/FOPflash reporter assay': experimental,
        'dual-luciferase reporter assay': experimental,
        'RNA immunoprecipitation assay (RIP)': rip,
        'Chromogenic in situ hybridization': experimental,
        'Luciferase reporter assa': luciferase,
        # These two mis-encoded keys used to map to a value starting with '|',
        # which left an empty element in the joined detection-method field.
        # NOTE(review): the immunoprecipitation half is assumed to be the same
        # term used for 'Immunoprecipitaion' above - confirm.
        'Immunoprecipitaionă„ĄLuciferase reporter assay': rip + '|' + luciferase,
        'ImmunoprecipitaionㄥLuciferase reporter assay': rip + '|' + luciferase,
        'Luciferase reporter assay/Microarray': luciferase + '|' + array,
        'q-PCR': qrt_pcr,
        'AGO2 Immunoprecipitation': rip,
        'Cell proliferation assays': experimental,
        'LC-MS/MS': mass_spec,
        'Chromatin Immunoprecipitation': chip,
        'Co-immunoprecipitation': rip,
        'IlluminaExpressionArrays': array,
        'Protein Immunoblot Analyses': experimental,
        'miR PCR array system': array,
        'mtt': experimental,
        'RNA immunopercipitation': rip,
        'TOP/FOP luciferase assay': luciferase,
        'miRNA-masking antisense ODN (miR-Mask) assay': experimental,
        'enzyme-linked immunosorbent assay': experimental,
        'Ago2-IP/IgG-IP': rip,
        'EGFR reporter assay': experimental,
        'immunoblot analysis': experimental,
        'Immunohistochemical analysis': ihc,
        'CC tissues and cells (C33A, HeLa, CaSki, SiHa, and ME-180)': experimental,
        'Immuno-precipitation': rip,
        'Luciferase reporter assayMTT': luciferase,
        'Immunostaining': immunostaining,
        'immunosorbent': elisa,
        'Immunofluorescent Assay': experimental,
        'YFP reporter assay': experimental,
        'CLIP-seq': clip,
        'RNAi': experimental,
        '': experimental,
    }


def main(logger):
    """Parse the TarBase input files and store the interactions as layer-5 edges.

    Every file in DATA_FILE_LIST is read as tab-separated text. A line is used
    only when both species columns (index 2 and 5) are keys of SPECIES_DICT.
    For each such line a miRBase source node and a GeneID target node are
    fetched or created through insert_or_get_node_dict, and an edge between
    them is inserted. The resulting database is saved to DB_DESTINATION.

    :param logger: logger used to report malformed lines; may be None
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # Built once: these do not depend on the line being processed
    detmap = _tarbase_detection_method_map()
    fallback_method = 'MI:0045(experimental interaction detection)'
    interaction_types = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

    for data_path in DATA_FILE_LIST:
        print("processing input file: " + data_path)
        with open(data_path) as data:
            lines = 0
            for line in data:
                columns = line.split('\t')

                # Columns up to index 8 are read below; blank or truncated
                # lines are skipped instead of raising IndexError
                if len(columns) < 9:
                    if logger is not None:
                        logger.debug(
                            "number of columns less than 9: %s" % line)
                    continue

                if columns[2] not in SPECIES_DICT or columns[5] not in SPECIES_DICT:
                    continue

                lines += 1
                if lines % 50000 == 0:
                    print("processed lines (TarBase): %d" % lines)

                # there are a few malformed miRbase IDs, like:    hsa-let-7c*    -->   hsa-let-7c
                # also during the mapping we dont care about the 3p/5p postfix
                rna_id = columns[1].replace("*", "")
                if rna_id.endswith(("-3p", "-5p")):
                    rna_id = rna_id[:-3]

                # Entrez Gene ID in the input file: 3091.0
                gene_id = columns[4].split('.')[0].strip()

                source_dict = insert_or_get_node_dict(
                    rna_id, 'miRBase',
                    'taxid:' + SPECIES_DICT[columns[2]]['tax_id'],
                    node_names_to_id, db_api)
                target_dict = insert_or_get_node_dict(
                    gene_id, 'GeneID',
                    'taxid:' + SPECIES_DICT[columns[5]]['tax_id'],
                    node_names_to_id, db_api)

                # The TarBase v7 publication always comes first; the line's
                # own id (example: 16921378.0) is added when present and
                # different, so the joined string has a stable order
                pubmed_ids = ['pubmed:25416803']
                pubmed_id = columns[8].split('.')[0].strip()
                if pubmed_id and 'pubmed:' + pubmed_id not in pubmed_ids:
                    pubmed_ids.append('pubmed:' + pubmed_id)

                # Methods are separated by '//' and, within those, by ';'
                detlist = []
                for method in columns[6].split('//'):
                    for real_method in method.split(';'):
                        mapped_method = detmap.get(real_method)
                        if mapped_method is None:
                            print(
                                "WARNING: detection method not recognised: "
                                + real_method)
                            mapped_method = fallback_method
                        detlist.append(mapped_method)

                edge_dict = {
                    'publication_ids': '|'.join(pubmed_ids),
                    'layer': '5',
                    'source_db': 'TarBase',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': '|'.join(detlist),
                    'interaction_types': interaction_types,
                    'first_author': None
                }

                db_api.insert_edge(source_dict, target_dict, edge_dict)
            print("processed lines (TarBase): %d" % lines)

    # Saving the database to the DB_DESTINATION file
    db_api.save_db_to_file(DB_DESTINATION)
    def __init__(self, db, layer, PROT_DBname, LNCRNAMAP_DBname):
        """
        Open the mapping databases, the source database and a fresh output database.

        :param db: name of the parsed source database; used to build the
            source path 'all_output/<db>.db' and the '<db>_mapped.db' output path
        :param layer: layer name; the lncRNA map database is only opened when
            this is 'lncRNA' or 'miRNA'
        :param PROT_DBname: path of the protein mapping db; if value is not None,
            the database is copied into memory and indexed
        :param LNCRNAMAP_DBname: path of the lncRNA mapping db; if value is not
            None, the database is copied into memory and indexed
        :argument DICTIONARY_DB_LOCATION: location of the mapping db, output of create_mapping_db
        :argument SQL_SEED_LOCATION
        :argument SOURCE_DB_LOCATION: location of the parsed source database
        :argument DESTINATION_DB_LOCATION: location where the mapped db will be saved
        """

        def load_into_memory(db_path, index_statements):
            """Copy the SQLite file at db_path into an in-memory db and run index_statements on it."""
            self.con = sqlite3.connect(db_path)
            try:
                dump_script = ''.join(
                    '%s\n' % line for line in self.con.iterdump())
            finally:
                # Close the file-backed connection even if the dump fails
                self.con.close()

            memory_db = sqlite3.connect(":memory:")
            with memory_db:
                memory_db.cursor().executescript(dump_script)
                for statement in index_statements:
                    memory_db.cursor().execute(statement)
            return memory_db

        # Declaring, and assigning constants
        self.DICTIONARY_DB_LOCATION = PROT_DBname
        self.SQL_SEED_LOCATION = '../../SLKlib/SQLiteDBApi/network-db-seed.sql'
        self.SOURCE_DB_TYPE = db
        self.layer = layer
        # The db we want to map
        self.SOURCE_DB_LOCATION = 'all_output/' + db + '.db'
        # Saving location
        self.DESTINATION_DB_LOCATION = '../../SLKlib/mapper/protein/output/' + db + '_mapped.db'
        # Protein map db
        # NOTE(review): sqlite3.connect() raises TypeError for None, so the
        # "PROT_DBname is None" branch below cannot currently be reached.
        self.DICTIONARY_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
        self.DICTIONARY_DB_CURSOR = self.DICTIONARY_DB.cursor()

        # IF NOT EXISTS keeps index creation from failing when the dumped or
        # file-backed database already contains the index.
        self.PROT_DBname = PROT_DBname
        if self.PROT_DBname is not None:
            self.PROT_DB = load_into_memory(self.PROT_DBname, (
                "CREATE INDEX IF NOT EXISTS map_uniprot ON MAPP(uniprot_ac);",
                "CREATE INDEX IF NOT EXISTS uniprotac_id ON UNIPROT_AC(id);",
                "CREATE INDEX IF NOT EXISTS taxid ON SPECIES(tax_id);",
                "CREATE INDEX IF NOT EXISTS map_foreign ON MAPP(foreign_id);",
            ))
        else:
            self.PROT_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
            self.PROT_DB.cursor().execute(
                "CREATE INDEX IF NOT EXISTS index_name ON mapp (foreign_id);")

        # For lncRNA and miRNA
        if self.layer in ('lncRNA', 'miRNA'):
            self.LNCRNAMAP_DB_LOCATION = LNCRNAMAP_DBname
            self.LNCRNAMAP_DBname = LNCRNAMAP_DBname
            if self.LNCRNAMAP_DBname is not None:
                self.LNCRNAMAP_DB = load_into_memory(self.LNCRNAMAP_DBname, (
                    "CREATE INDEX IF NOT EXISTS index_name ON mapper (orig_ac);",
                ))
            else:
                self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

        self.new_db = PsimiSQL(self.SQL_SEED_LOCATION)
        # iterating through the old_db's nodes
        self.source_db = sqlite3.connect(self.SOURCE_DB_LOCATION)
        self.source_db.row_factory = sqlite3.Row
        self.cur = self.source_db.cursor()
Example #20
0
def main(logger):
    """Parse the TCR curation file and save its nodes and edges to DESTINATION.

    The file at TCR_DATA_LOC is read as tab-separated text after skipping one
    header line. For every line, node A and node B are inserted and a layer-8
    edge between them is added.

    :param logger: not used by this parser
    """
    with open(TCR_DATA_LOC, encoding="ISO-8859-1") as tcr_data_file:
        # Skipping the header line
        tcr_data_file.readline()

        # Initiating a PsimiSQL object
        parser = PsimiSQL(SQL_SEED)

        for line in tcr_data_file:
            cells = line.split('\t')

            # Storing the needed properties in variables
            name_a = cells[1].strip()
            name_b = cells[3].strip()

            alt_accession_a = cells[0]
            alt_accession_b = cells[2]

            if name_a == '':
                continue

            # Building the node dictionaries, and inserting them to the db with the parser
            node_a_dict = {
                'name': "Uniprot:" + name_a,
                'alt_accession': "entrez gene/locuslink:" + alt_accession_a,
                'tax_id': "taxid:9606",
                'pathways': "T-cell receptor",
                'aliases': None
            }

            parser.insert_node(node_a_dict)

            # Node A stays inserted even when node B is missing; only the
            # edge is skipped in that case
            if name_b == '':
                continue

            node_b_dict = {
                'name': "Uniprot:" + name_b,
                'alt_accession': "entrez gene/locuslink:" + alt_accession_b,
                'tax_id': "taxid:9606",
                'pathways': "T-cell receptor",
                'aliases': None
            }

            parser.insert_node(node_b_dict)

            # Gathering the edge's properties, and inserting the edge to the db
            interaction_direction = IS_DIRECT_MAP[cells[5].lower()]
            interaction_effect = EFFECT_MAP[cells[6].lower().strip()]

            pubmed_ids = cells[8]

            interaction_types = "%s|is_directed:%s|is_direct:%s" % (
                interaction_effect, "true", interaction_direction)

            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': get_mitab_publication_list_string(pubmed_ids),
                'interaction_types': interaction_types,
                'source_db': "TCRcuration",
                'interaction_identifiers': None,
                'confidence_scores': None,
                'layer': '8'
            }

            parser.insert_edge(node_a_dict, node_b_dict, edge_dict)

    # Saving the db to a file
    parser.save_db_to_file(DESTINATION)
Example #21
0
def main(logger):
    """
    Import PTMCode2 associations into a PsimiSQL database.

    Reads the tab-separated DATA_FILE (ISO-8859-1, four header lines), keeps the rows whose
    organism column names one of the supported species and inserts one layer '2' edge per
    remaining row. The database is finally saved to EXPORT_DB_LOCATION.

    :param logger: logger that receives a debug message for every row that does not have
                   exactly 14 columns
    """
    # Species that are imported. The names are compared with the third column of the data
    # file, so they have to be spelled exactly as the species names appear there.
    supported_organisms = (
        'Homo sapiens',
        'Drosophila melanogaster',
        'Danio rerio',
        'Caenorhabditis elegans',
    )

    # Every PTMCode2 edge gets the same interaction type, so it is built only once
    interaction_types = "%s|is_directed:%s|is_direct:%s" \
                        % ('MI:0190(interaction type)', "true", 'false')

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    with open(DATA_FILE, encoding='ISO-8859-1') as data:

        # Skipping the four header lines
        for _ in range(4):
            data.readline()

        skipped_lines = 0
        lines = 0
        for line in data:
            lines += 1
            if lines % 50000 == 0:
                print("processed lines (PTMCode2): %d" % lines)

            columns = line.split('\t')
            if len(columns) != 14:
                logger.debug("number of colums not 14: %s" % line)
                continue

            organism = columns[2]
            if organism not in supported_organisms:
                continue
            taxid = ORGANISM_NAME_TO_MITAB_ID_MAP[organism]

            # Getting rid of beta'Cop because it can not be mapped due to syntax error
            if columns[0] == "beta'Cop" or columns[1] == "beta'Cop":
                continue

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = insert_or_get_node_dict(
                columns[0].strip().replace(" ", ""), "GeneCards", taxid,
                node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                columns[1].strip().replace(" ", ""), "GeneCards", taxid,
                node_names_to_id, db_api)

            # A falsy node dict means the ID could not be used; the row is counted and dropped
            if not source_dict or not target_dict:
                skipped_lines += 1
                continue

            edge_dict = {
                'publication_ids': 'pubmed:25361965',
                'layer': '2',
                'source_db': DB_TYPE,
                'interaction_identifiers': None,
                'confidence_scores': None,  # if available
                'interaction_detection_method': None,  # probably exp type
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

        print("processed lines (PTMCode2): %d" % lines)
        print("skipped lines (malformed IDs): %d" % skipped_lines)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #22
0
def main(logger):
    """
    Import manually curated interactions into a PsimiSQL database.

    Every file of DATA_FILE_LIST is read as ISO-8859-1, tab-separated text with six header
    lines. For each usable row the two nodes are fetched with get_node_a / get_node_b
    (and inserted when they have no 'id' yet) and one layer '1' edge is inserted.
    The database is finally saved to DB_DESTINATION.

    Rows are skipped, with a printed warning, when a species or the effect flag is not
    recognised. Previously such rows either raised UnboundLocalError or silently reused
    the value left over from the preceding row.

    :param logger: not used by this importer
    :raises KeyError: if the molecular background of a row is not a key of the mapping table
    """
    # Species name -> MITAB tax id (only human is mapped)
    species_to_taxid = {'human': 'taxid:9606'}

    # Effect flag (column index 8) -> PSI-MI term
    effect_map = {
        'S': 'MI:0840(stimulator)',
        's': 'MI:0840(stimulator)',
        'I': 'MI:0586(inhibitor)',
    }

    # Molecular background (column index 9) -> PSI-MI term.
    # The table never changes, so it is built once instead of once per row.
    molec_map = {
        'P': 'MI:0217(phosphorylation reaction)',
        'Acetylation': 'MI:0192(acetylation reaction)',
        'degradation (ubiquitinilation)': 'MI:0220(ubiquitination reaction)',
        'autoP': 'MI:0217(phosphorylation reaction)',
        'csak beköt': 'MI:0462(bind)',
        'proteolízis': 'MI:0414(enzymatic reaction)',
        'proteolízis ("delipidálás")': 'MI:0414(enzymatic reaction)',
        '"proteolízis (""delipidálás"")"': 'MI:0414(enzymatic reaction)',
        'E2 - kovalens tioészter kötés': 'MI:0195(covalent binding)',
        'kovalens': 'MI:0195(covalent binding)',
        'kovalens tioészter kötés': 'MI:0195(covalent binding)',
        'E1 - kovalens tioészter kötés': 'MI:0195(covalent binding)',
        'E1-E2 komplex': 'MI:0195(covalent binding)',
        '': ''
    }

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Parsing data file
    for file in DATA_FILE_LIST:
        print(file)
        with open(file, encoding="ISO-8859-1") as data:
            # Skipping the six header lines
            for _ in range(6):
                data.readline()

            for raw_line in data:
                line = raw_line.strip().split('\t')
                # Columns 0-4 and 8 are required below; shorter rows are probably
                # leftovers of the conversion from xlsx to tsv
                if len(line) < 9:
                    continue

                # Mapping species to taxid and the effect flag to its PSI-MI term
                taxid_source = species_to_taxid.get(line[0])
                taxid_target = species_to_taxid.get(line[2])
                stimulation = effect_map.get(line[8])
                if taxid_source is None or taxid_target is None or stimulation is None:
                    print("WARNING: unknown species or effect flag in line: " + raw_line)
                    continue

                # strip() above removes an empty trailing column, so a missing column 9
                # is treated as an empty molecular background
                background = line[9] if len(line) > 9 else ''

                # Creating the node dicts, if the node is already in the db assigning that to the node dict
                source_dict = get_node_a('Uniprot:' + line[1], taxid_source,
                                         db_api)
                target_dict = get_node_b('Uniprot:' + line[3], taxid_target,
                                         db_api)

                # Nodes are inserted to the db if they are not in it yet
                if 'id' not in source_dict:
                    db_api.insert_node(source_dict)

                if 'id' not in target_dict:
                    db_api.insert_node(target_dict)

                # NOTE(review): the earlier version also read the flags in columns 5 and 7
                # but never used them; both attributes are always written as 'true'.
                # Confirm whether those columns should drive is_directed / is_direct.
                int_types = '|'.join([
                    stimulation, molec_map[background], 'is_direct:true',
                    'is_directed:true'
                ])

                edge_dict = {
                    'publication_ids': 'pubmed:' + line[4],
                    'layer': '1',
                    'source_db': 'manual curation',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': None,
                    'interaction_types': int_types,
                    'first_author': None
                }

                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
Example #23
0
def main(logger):
    """
    Import IntAct interactions into a PsimiSQL database.

    Reads the tab-separated DATA_FILE (one header line), keeps the rows where both
    interactors belong to one of the known tax ids and both carry a well formed Uniprot
    ID, and inserts one layer '3' edge per kept row. Counters are printed at the end and
    the database is saved to EXPORT_DB_LOCATION.

    All '|'-joined edge fields are written in sorted order, so repeated runs over the same
    input produce the same output.

    :param logger: not used by this importer
    """
    known_tax_ids = ('taxid:9606', 'taxid:7227', 'taxid:6239', 'taxid:7955')

    def psi_mi_terms(field):
        """Return the set of 'psi-mi:' terms of a '|'-separated field, without prefix and quotes."""
        terms = set()
        for term in field.split("|"):
            term = term.strip()
            if term.lower().startswith('psi-mi:'):
                terms.add(term[7:].replace("\"", "").strip())
        return terms

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Parsing files
    with open(DATA_FILE) as data:
        data.readline()
        missing_alt_id = 0
        edges_in_known_taxes = 0
        short_lines = 0
        node_names_to_id = {}
        lines = 0

        for line in data:
            columns = line.split('\t')
            if len(columns) < 2:
                continue
            lines += 1
            if lines % 50000 == 0:
                print("processed lines (IntAct): %d" % lines)

            # Column index 14 is the highest one read below; a truncated row is counted
            # and skipped instead of aborting the whole import with an IndexError
            if len(columns) < 15:
                short_lines += 1
                continue

            # tax id like: taxid:9606(human), taxid:83333(ecoli), taxid:-1(in vitro)
            # Cutting at the parenthesis (instead of keeping the first 10 characters)
            # prevents longer ids such as taxid:96061 from being mistaken for taxid:9606
            tax_id_a = columns[9].split("(")[0]
            tax_id_b = columns[10].split("(")[0]

            if tax_id_a not in known_tax_ids or tax_id_b not in known_tax_ids:
                continue

            edges_in_known_taxes += 1

            if not (is_well_formed_uniprot_id(columns[2])
                    and is_well_formed_uniprot_id(columns[3])):
                missing_alt_id += 1
                continue

            uniprot_id_a = columns[2][10:].strip()
            uniprot_id_b = columns[3][10:].strip()

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = insert_or_get_node_dict(uniprot_id_a, 'Uniprot',
                                                  tax_id_a, node_names_to_id,
                                                  db_api)
            target_dict = insert_or_get_node_dict(uniprot_id_b, 'Uniprot',
                                                  tax_id_b, node_names_to_id,
                                                  db_api)

            # interaction detection methods: psi-mi:"MI:0096"(pull down)|psi-mi:"MI:0018"(two hybrid)
            detection_methods = psi_mi_terms(columns[6])

            # pubmed ids: pubmed:10887206|mint:MINT-5212759
            pubmed_ids = {"24234451"}  # intact publication
            for entry in columns[8].split("|"):
                entry = entry.strip()
                if not entry.lower().startswith('pubmed:'):
                    continue
                pubmed_id = entry[7:]
                if re.search(r"^\d+$", pubmed_id):
                    pubmed_ids.add(pubmed_id)

            # interaction type: psi-mi:"MI:0407"(direct interaction)|psi-mi:"MI:0915"(physical association)
            interaction_type_terms = psi_mi_terms(columns[11])

            # we remove 'MI:0407(direct interaction)' term, as it is redundant with the is_direct attribute
            interaction_type_terms.discard("MI:0407(direct interaction)")

            interaction_type = "is_directed:false|is_direct:true"
            if interaction_type_terms:
                interaction_type += "|" + "|".join(
                    sorted(interaction_type_terms))

            # interaction score examples in the IntAct input file:
            # - intact-miscore:0.558037
            # - author score:low
            # - author score:Retest score=6; Class=Core; confidence score set1/set2 =2
            # - author score:"Socio-affinity score: 6.11118"
            # - author-confidence:Z-score = 17.60
            # - replication-based confidence:4
            # we don't keep the author-type scores, as those are a mess and also contains several non-numeric scores
            confidence_scores = set()
            for score in columns[14].split("|"):
                score = score.strip()
                if score.startswith(("author score:", "author-confidence:")):
                    continue
                score = score.replace("intact-miscore", "intact miscore")
                if not score.lower().startswith("intact"):
                    score = "intact " + score
                confidence_scores.add(score)

            edge_dict = {
                'publication_ids':
                "|".join("pubmed:" + x for x in sorted(pubmed_ids)),
                'layer': '3',
                'source_db': "IntAct",
                'interaction_identifiers': None,
                'confidence_scores': "|".join(sorted(confidence_scores)),
                'interaction_detection_method':
                "|".join(sorted(detection_methods)),
                'interaction_types': interaction_type,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

        print("processed lines (IntAct): %d" % lines)
        print("number of links in the known species: %d" %
              edges_in_known_taxes)
        print("links with missing uniprot ID: %d" % missing_alt_id)
        print("skipped lines with less than 15 columns: %d" % short_lines)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #24
0
def main(logger):
    """
    Import human Reactome interactions into a PsimiSQL database.

    Three tab-separated inputs are combined:

    * UNI_TO_PATHWAY maps an interactor ID (first column) to a Reactome pathway ID (second
      column); only rows whose sixth column is 'Homo sapiens' are used.
    * PATHWAY_FILE_LOCATION (one header line) maps a Reactome pathway ID to a SignaLink
      pathway.
    * DATA_FILE (one header line) holds the interactions.

    An interaction is imported only if both interactors have a pathway that can be mapped
    to SignaLink and both have the tax id 'taxid:9606'. Each imported row becomes one
    layer '8' edge. The database is finally saved to EXPORT_DB_LOCATION.

    :param logger: not used by this importer
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    path_dict = {}

    # Uncomment if using run_all_auto
    # data = DATA_FILE.split('\n')

    # Getting only human data
    # path_map = UNI_TO_PATHWAY.split('\n')

    with open(DATA_FILE) as data, open(UNI_TO_PATHWAY) as path_map:

        for line in path_map:
            line = line.strip().split('\t')

            # The species is read from index 5, so at least six columns are required
            if len(line) > 5 and line[5] == 'Homo sapiens':
                path_dict[line[0]] = line[1]

        data.readline()

        reactome_to_signalink_pathway_map = {}

        # The context manager closes the pathway file, which was previously left open
        with open(PATHWAY_FILE_LOCATION) as pathway_file:
            # Skipping the header; the default avoids StopIteration on an empty file
            next(pathway_file, None)

            for line in pathway_file:
                reactome_pathway_id, signalink_pathway = line.strip().split(
                    '\t')
                reactome_to_signalink_pathway_map[
                    reactome_pathway_id] = signalink_pathway

        node_names_to_id = {}
        for line in data:

            columns = line.strip().split('\t')
            if len(columns) <= 1:
                continue

            # Interactor columns look like '<id type>:<id>'
            interactor_a = columns[0].strip().split(":")
            interactor_b = columns[1].strip().split(":")
            id_type_a, id_a = interactor_a[0], interactor_a[1]
            id_type_b, id_b = interactor_b[0], interactor_b[1]

            # Building the pathway dict for SLK3 pathways
            if id_a not in path_dict or id_b not in path_dict:
                continue

            if path_dict[id_a] not in reactome_to_signalink_pathway_map \
                    or path_dict[id_b] not in reactome_to_signalink_pathway_map:
                continue

            interactor_a_tax_id = columns[9].split("(")[0]
            interactor_b_tax_id = columns[10].split("(")[0]
            if interactor_a_tax_id != "taxid:9606" \
                    or interactor_b_tax_id != "taxid:9606":
                continue

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = insert_or_get_node_dict(
                id_a, id_type_a, columns[2], columns[4], interactor_a_tax_id,
                reactome_to_signalink_pathway_map[path_dict[id_a]],
                node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                id_b, id_type_b, columns[3], columns[5], interactor_b_tax_id,
                reactome_to_signalink_pathway_map[path_dict[id_b]],
                node_names_to_id, db_api)

            # Setting up the interaction type
            effect = columns[11].replace('psi-mi:', '').replace('"', '')
            interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                % (effect, 'true', 'false')

            # pubmed:29145629 is always added to the publications of the row
            if columns[8] != '-':
                pubmed = columns[8].split("|")
                pubmed.append("pubmed:29145629")
                pubmed_ids = "|".join(pubmed)
            else:
                pubmed_ids = "pubmed:29145629"

            edge_dict = {
                'publication_ids': pubmed_ids,
                'layer': '8',
                'source_db': 'Reactome',
                'interaction_identifiers': None,
                'confidence_scores': columns[14].split("(")[0],
                'interaction_detection_method':
                columns[6].replace('psi-mi:', '').replace('"', ''),
                'interaction_types': interaction_types,
                'first_author': columns[7]
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #25
0
def main(logger):
    """
    Import OmniPath interactions into a PsimiSQL database.

    Reads the tab-separated DATA_FILE (one header line). Every row with a valid direction
    flag becomes one layer '3' edge between two human ('taxid:9606') Uniprot nodes.
    The database is finally saved to EXPORT_DB_LOCATION.

    Rows that are too short or whose direction flag is neither '0' nor '1' are reported
    with a printed warning and skipped before anything is written to the database.
    Previously an unknown flag either raised UnboundLocalError or silently reused the
    direction of the preceding row.

    :param logger: not used by this importer
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()
        node_names_to_id = {}
        lines = 0

        for line in data:
            columns = line.strip().split('\t')
            lines += 1
            if lines % 50000 == 0:
                print("processed lines (OmniPath): %d" % lines)

            # Column indexes 0-4 are required below
            if len(columns) < 5:
                print("WARNING: too few columns in line: " + line)
                continue

            # the link is indirect, unless it is directed or if it has a role as inhibitor or stimulator
            if columns[2] == '1':
                directed = 'true'
                direct = 'true'
            elif columns[2] == '0':
                directed = 'false'
                direct = 'false'
            else:
                print("WARNING: unknown direction flag in line: " + line)
                continue

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = insert_or_get_node_dict(columns[0], "Uniprot",
                                                  'taxid:9606',
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(columns[1], "Uniprot",
                                                  'taxid:9606',
                                                  node_names_to_id, db_api)

            interaction_type_terms = []

            if columns[3] == '1':
                interaction_type_terms.append('MI:0624(stimulant)')
                direct = 'true'

            if columns[4] == '1':
                interaction_type_terms.append('MI:0623(inhibition)')
                direct = 'true'

            interaction_types = "is_directed:%s|is_direct:%s" % (directed,
                                                                 direct)
            if interaction_type_terms:
                interaction_types += "|" + "|".join(interaction_type_terms)

            # strip() above removes an empty trailing column, so a missing references
            # column is treated as "no references"
            references = columns[7] if len(columns) > 7 else ''

            pubmed_ids = {"27898060"}  # OmniPath publication
            for reference in references.split(';'):
                reference = reference.strip()
                if re.search(r"^\d+$", reference):
                    pubmed_ids.add(reference)

            edge_dict = {
                # sorted, so the field does not depend on the iteration order of the set
                'publication_ids':
                "|".join("pubmed:" + x for x in sorted(pubmed_ids)),
                'layer': '3',
                'source_db': 'OmniPath',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

        print("processed lines (OmniPath): %d" % lines)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
Example #26
0
def main(logger):
    """
    Import StarBase RNA - gene interactions into a PsimiSQL database.

    Each file of DATA_FILE_LIST is tab-separated with four header lines. How a file is
    read (ID columns, ID types, tax id, score columns, detection method) is taken from
    the FILE_DICT entry stored under the file's base name. Every data row becomes one
    layer '5' edge; the database is saved to DB_DESTINATION at the end.

    :param logger: not used by this importer
    """
    database = PsimiSQL(SQL_SEED)
    known_nodes = {}

    # The interaction type is the same for every StarBase edge
    edge_type = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

    for path in DATA_FILE_LIST:
        print("processing data file: " + path)
        with open(path) as handle:

            # Skipping the header
            for _ in range(4):
                handle.readline()

            meta = FILE_DICT[path.split("/")[-1]]

            for row in handle:
                cells = row.split('\t')
                if len(cells) < 2:
                    continue

                rna_id = cells[meta['rna_id_column']].strip()
                # during the rna mapping, we dont care about the 3p/5p postfix
                if meta['rna_id_type'] == 'miRBase' and rna_id.endswith(
                        ("-3p", "-5p")):
                    rna_id = rna_id[:-3]
                if meta['rna_id_type'] == 'HGNC':
                    rna_id = rna_id.lower()

                gene_id = cells[meta['gene_id_column']].strip()
                # The wormBase IDs in the mapping DB contains only uppercase IDs
                if meta['gene_id_type'] == 'WormBase':
                    gene_id = gene_id.upper()

                rna_node = insert_or_get_node_dict(rna_id,
                                                   meta['rna_id_type'],
                                                   meta['tax_id'],
                                                   known_nodes, database)
                gene_node = insert_or_get_node_dict(gene_id,
                                                    meta['gene_id_type'],
                                                    meta['tax_id'],
                                                    known_nodes, database)

                # One "<score name>:<value>" entry per configured score column
                score_field = "|".join(
                    "{1}:{0}".format(cells[spec['column']].strip(),
                                     spec['score_name'].strip())
                    for spec in meta['scores'])

                # Inserting edges
                database.insert_edge(
                    rna_node, gene_node, {
                        'publication_ids':
                        'pubmed:24297251',  # StarBase v2.0 publication
                        'layer': '5',
                        'source_db': 'StarBase',
                        'interaction_identifiers': None,
                        'confidence_scores': score_field,
                        'interaction_detection_method':
                        meta['detection_method'],
                        'interaction_types': edge_type,
                        'first_author': None
                    })

    # Saving the to a DB_TYPE.db files
    database.save_db_to_file(DB_DESTINATION)
    print("StarBase finished " + DB_DESTINATION)
Example #27
0
def main(logger):
    """
    Import lncRInter interactions into a PsimiSQL database.

    Reads the tab-separated DATA_FILE (one header line). Rows whose organism (column index
    4, compared lower-cased and stripped) is a key of ORGANISM_TO_TAXID become one layer
    '7' edge between two HGNC nodes. The free-text detection method (column index 8) is
    translated to PSI-MI terms; unknown methods are reported with a printed warning and
    stored as None. The database is finally saved to DB_DESTINATION.

    :param logger: not used by this importer
    """
    # Free-text detection method -> PSI-MI term(s).
    # The table is built once, outside the row loop. Its keys are compared with the
    # *stripped* column value, so they must not carry surrounding whitespace: the
    # previous keys 'Luciferase reporter assay, Pulldown assay ', 'luciferase reporter '
    # and 'RIP and ChIP assay ' (all with a trailing space) could never match.
    # The first one is now stored stripped; the other two were shadowed by their already
    # existing stripped twins, which keep the values that were effective before.
    detmap = {
        'pull-down assay':
        'MI:0096(pull down)',
        'qPCR, Western blot, RIP.':
        'MI:1195(quantitative pcr)|MI:0113(western blot)|MI:1017(rna immunoprecipitation)',
        'qRT-PCR, RNAi':
        'MI:1196(quantitative reverse transcription pcr)',
        'in vitro':
        'MI:0045(experimental interaction detection)',
        'In vitro':
        'MI:0045(experimental interaction detection)',
        'Luciferase reporter assay, Pulldown assay':
        'MI:2285(miRNA interference luciferase reporter assay)|MI:0096(pull down)',
        'luciferase reporter assays and pull-down assay':
        'MI:2285(miRNA interference luciferase reporter assay)|MI:0096(pull down)',
        'RIP':
        'MI:1017(rna immunoprecipitation)',
        'luciferase reporter constructs':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'in vitro and vivo':
        'MI:0045(experimental interaction detection)',
        'dual luciferase reporter assay?':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'dual luciferase reporter assays':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'qPCR, RNAi etc.':
        'MI:1195(quantitative pcr)',
        'ISH and Luciferase Assay':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'In vitro RNA/dsDNA binding assay utilizing biotin tagged RNA oligos as bait':
        'MI:0045(experimental interaction detection)',
        'In vitro RNA/dsDNA binding assay':
        'MI:0045(experimental interaction detection)',
        'biotin-avidin pull-down system':
        'MI:0096(pull down)',
        'microRNA crosslinking and immunoprecipitation (miR-CLIP)':
        'MI:2191(clip)',
        'Luciferase reporter assay and qPCR':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'RNA immunoprecipitation;luciferase reporter assays':
        'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)',
        'in vivo':
        'MI:0045(experimental interaction detection)',
        'luciferase reporter assays':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'RNA immunoprecipitation and luciferase reporter assays':
        'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)',
        'EMSA':
        'MI:0413(electrophoretic mobility shift assay)',
        'luciferase reporter assay':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'Luciferase assays':
        'MI:2285(miRNA interference luciferase reporter assay)',
        '-':
        'MI:0045(experimental interaction detection)',
        'RNA immunoprecipitation':
        'MI:1017(rna immunoprecipitation)',
        'RIP, Biotin-RNA Pull-Down Assay,qRT-PCR,EMSA':
        'MI:1017(rna immunoprecipitation)|MI:0096(pull down)|MI:0413(electrophoretic mobility shift assay)|MI:1196(quantitative reverse transcription pcr)',
        'Luciferase reporter assay, RIP assay and RNA pull-down assay':
        'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)|MI:0096(pull down)',
        'qPCR, Western blot and RNAi':
        'MI:1195(quantitative pcr)|MI:0113(western blot)',
        'luciferase  reporter  assay':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'CLIP':
        'MI:2191(clip)',
        'in vitro or vivo':
        'MI:0045(experimental interaction detection)',
        'RNA pull-down assay':
        'MI:0096(pull down)',
        'immunoprecipitation (RIP) assay and RNA pull-down assay':
        'MI:1017(rna immunoprecipitation)|MI:0096(pull down)',
        'luciferase reporter':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'in vitro and in vivo':
        'MI:0045(experimental interaction detection)',
        'in viro':
        'MI:0045(experimental interaction detection)',
        'co-RNA-FISH assays':
        'MI:0045(experimental interaction detection)',
        'microarray, qPCR':
        'MI:1195(quantitative pcr)',
        'In vitro and in vivo':
        'MI:0045(experimental interaction detection)',
        'Luciferase reporter assays':
        'MI:2285(miRNA interference luciferase reporter assay)',
        'RIP and Luciferase assays':
        'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)',
        'RNA-FISH':
        'MI:0045(experimental interaction detection)',
        'RNA FISH':
        'MI:0045(experimental interaction detection)',
        'FISH, Allele-specific RT-PCR':
        'MI:1196(quantitative reverse transcription pcr)',
        'RIP and RNA pull-down':
        'MI:1017(rna immunoprecipitation)',
        'RIP and ChIP assay':
        'MI:0019(coimmunoprecipitation)'
    }

    # The interaction type is the same for every lncRInter edge
    interaction_types = "is_directed:true|is_direct:true|MI:0407(direct interaction)"

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()

        for line in data:
            columns = line.split('\t')
            # Column index 9 is the highest one read below; blank or truncated rows are
            # skipped instead of raising an IndexError
            if len(columns) < 10:
                continue

            organism = columns[4].strip().lower()
            if organism not in ORGANISM_TO_TAXID:
                continue
            tax_id = ORGANISM_TO_TAXID[organism]

            source_dict = insert_or_get_node_dict(columns[0], "HGNC", tax_id,
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(columns[1], "HGNC", tax_id,
                                                  node_names_to_id, db_api)

            method = columns[8].strip()
            detmethod = detmap.get(method)
            if detmethod is None:
                print("WARNING: unknown detection method: " + method)

            pubmed_ids = {'28529080'}  # lncRInter publication
            pubmed_id = columns[9].strip()
            if pubmed_id:
                pubmed_ids.add(pubmed_id)

            # Inserting edges
            edge_dict = {
                # sorted, so the field does not depend on the iteration order of the set
                'publication_ids':
                "|".join('pubmed:' + x for x in sorted(pubmed_ids)),
                'layer': '7',
                'source_db': 'lncRInter',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': detmethod,
                'interaction_types': interaction_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(DB_DESTINATION)
    print("lncRInter finished")
def main(logger):
    """
    Load the semicolon-separated DATA_FILE into a PsimiSQL database.

    The first line of the file is treated as a header and skipped. Every
    following row produces a source node, a target node and one edge between
    them. After all rows are processed the database is written to
    DB_DESTINATION.

    :param logger: not used by this function
    """

    def lookup_or_create(name, taxid, pathway, alias, topology, sql_api):
        """
        Return the node that sql_api.get_node() finds for (name, taxid).
        If nothing is found, return a new node dict (without an 'id' key)
        so the caller can tell that it still has to be inserted.
        """
        stored = sql_api.get_node(name, node_tax_id=taxid)
        if stored:
            return stored
        return {
            "name": name,
            "tax_id": taxid,
            "alt_accession": None,
            'pathways': pathway,
            "aliases": alias,
            "topology": topology
        }

    def normalise_taxid(raw):
        """Prefix the bare human tax id; every other value is passed through."""
        return 'taxid:9606' if raw == '9606' else raw

    def pathway_names(cell):
        """
        Turn a comma-separated pathway cell into a '|'-separated string,
        keeping only the text before the first '(' of every entry.
        """
        return '|'.join(
            chunk.strip().split('(')[0] for chunk in cell.split(','))

    # Lookup tables (constant for the whole run)
    # Mapping layer descriptions to abbreviations
    layer_dict = {
        'Post-translational regulation': '2',
        'Interaction between autophagy proteins': '0',
        'Autophagy regulators': '1'
    }
    # Is directed
    directed_map = {'PPI directed': 'true', 'PPI undirected': 'false'}
    # Is direct
    direct_map = {'direct': 'true'}
    # Effect
    effect_map = {'stimulation': 'MI:0624(stimulation)'}
    # Sourcedb mapping
    sourcedb_map = {
        'BioGRID': 'TheBiogrid',
        'Behrends et Al. 2010': 'Behrends',
        'direction is predicted': 'Behrends predicted'
    }

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Parsing data file
    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()

        for row in data:
            cols = row.strip().split(';')

            # Taxid
            taxid_source = normalise_taxid(cols[2])
            taxid_target = normalise_taxid(cols[10])

            # Pathways
            source_ptw = pathway_names(cols[7])
            target_ptw = pathway_names(cols[15])

            # Topology: commas become '|' separators
            source_topol = cols[4].strip().replace(',', '|')
            target_topol = cols[12].strip().replace(',', '|')

            # Creating the node dicts, if the node is already in the db assigning that to the node dict
            source_dict = lookup_or_create('Uniprot:' + cols[1], taxid_source,
                                           source_ptw, cols[0], source_topol,
                                           db_api)
            target_dict = lookup_or_create('Uniprot:' + cols[9], taxid_target,
                                           target_ptw, cols[8], target_topol,
                                           db_api)

            # Nodes are inserted to the db if they are not in it yet
            for node in (source_dict, target_dict):
                if 'id' not in node:
                    db_api.insert_node(node)

            # Constructing interaction data line; the effect term is only
            # present when the row does not say 'unknown'
            is_direct = direct_map[cols[18]]
            type_parts = []
            if cols[19] != 'unknown':
                type_parts.append(effect_map[cols[19]])
            type_parts.append('is_directed:' + directed_map[cols[17]])
            type_parts.append('is_direct:' + is_direct)
            int_types = '|'.join(type_parts)

            # Publications: every '|'-separated entry gets a 'pubmed:' prefix
            publications = '|'.join(
                'pubmed:' + entry for entry in cols[20].split('|'))

            # Source databases: entries containing 'pmid' are dropped, known
            # names are translated, everything else is kept as it is
            db_names = (
                entry.strip().split('(')[0] for entry in cols[21].split(','))
            final_source = '|'.join(
                sourcedb_map.get(db_name, db_name)
                for db_name in db_names
                if 'pmid' not in db_name)

            edge_dict = {
                'publication_ids': publications,
                'layer': layer_dict[cols[16]],
                'source_db': final_source,
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': int_types,
                'first_author': None
            }

            db_api.insert_edge(source_dict, target_dict, edge_dict)

        # Saving the db to a file once all rows are processed
        db_api.save_db_to_file(DB_DESTINATION)
Example #29
0
def main(logger):
    """
    Import the files listed in DATA_FILE_LIST into a PsimiSQL database.

    The first four lines of every file are skipped as header. Each remaining
    line that splits into more than one tab-separated column produces two
    nodes and one edge. The database is written to DB_DESTINATION at the end.

    :param logger: not used by this function
    """
    # Which columns hold the miRNA and the lncRNA name, per input file.
    # Files missing from this table get None for both names.
    name_columns = {
        'lncRNA/databases/starbase/files/starbase_v3_miRNAlncRNA.txt':
        (1, 3),
        'lncRNA/databases/starbase/files/starbase_v3_ncRNA_degradome_human.txt':
        (1, 2),
        'lncRNA/databases/starbase/files/starbase_v3_ncRNA_degradome_worm.txt':
        (1, 2),
        'lncRNA/databases/starbase/files/starbase_v3_lncRNA_valid.txt':
        (1, 4),
    }

    # The interaction type is the same for every edge of this source
    interaction_types = "effect:%s|is_directed:%s|is_direct:%s" \
                        % ('MI:0256(rna interference)', 'directed', 'unknown')

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    for path in DATA_FILE_LIST:
        with open(path) as data:
            # Skipping the four header lines
            for _ in range(4):
                data.readline()

            for row in data:
                columns = row.split('\t')
                taxid = FILE_TO_TAXID[path]

                # Lines without any tab carry no interaction
                if len(columns) == 1:
                    continue

                indices = name_columns.get(path)
                if indices is None:
                    mirna_name = None
                    lncrna_name = None
                else:
                    mirna_name = columns[indices[0]]
                    lncrna_name = columns[indices[1]]

                # Creating the node dicts, if the node is already in the db assigning that to the node dict
                # NOTE(review): the miRNA name goes to get_node_lncrna and the
                # lncRNA name to get_node_mirna - this looks swapped; confirm
                # against the helper definitions before changing it.
                source_dict = get_node_lncrna(mirna_name, taxid, db_api)
                target_dict = get_node_mirna(lncrna_name, taxid, db_api)

                # Nodes are inserted to the db if they are not in it yet
                for node in (source_dict, target_dict):
                    if 'id' not in node:
                        db_api.insert_node(node)

                # Inserting edges
                edge_dict = {
                    'publication_ids': 'pubmed:24297251',
                    'layer': '7',
                    'source_db': 'starbase',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': file_to_detmet[path],
                    'interaction_types': interaction_types,
                    'first_author': None
                }

                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the db to a file
    db_api.save_db_to_file(DB_DESTINATION)
Example #30
0
def main(logger):
    """
    Split the raw BioGRID file (RAW_FILE) into pieces of roughly 10 Mb and
    store every piece in its own PsimiSQL database.

    The first line of the file is skipped as header. Each piece is saved as
    DB_DESTINATION + "db_piece_<n>", where <n> counts up from 0. Progress is
    written to stdout while parsing.

    :param logger: not used by this function
    """
    # Setting the size of the piece
    mb = 1024 * 1024
    piece_size = 10 * mb

    # The number of the little files
    file_counter = 0

    # The context manager guarantees that the raw file is closed again, also
    # when parsing fails half way through
    with open(RAW_FILE) as file_:
        # Measuring the file by jumping to its end
        file_.seek(0, os.SEEK_END)
        filesize = file_.tell()
        filesize_mb = filesize / mb

        # Resetting the iterator to the beginning of the file and skipping
        # the header line
        file_.seek(0)
        file_.readline()

        # Creating a psimi to sql db for every piece of the raw Biogrid file
        while file_.tell() < filesize:
            starting_position = file_.tell()
            parser = PsimiSQL(SQL_SEED)

            while file_.tell() < starting_position + piece_size:
                sys.stdout.write(
                    "Parsing piece: %d Mb / %d Mb  Total: %d Mb / %d Mb \r" %
                    ((file_.tell() - starting_position) / mb,
                     piece_size / mb, file_.tell() / mb, filesize_mb))

                # Dealing with the data
                line = file_.readline()
                cells = line.split("\t")

                try:
                    # Extracting node a's properties
                    node_a_dict = {
                        'name': extract_property("biogrid", cells[2]),
                        'alt_accession': extract_property(
                            "locuslink", cells[2]),
                        'tax_id': cells[9],
                        'pathways': None,
                        'aliases': None
                    }

                    # Extracting node b's properties
                    node_b_dict = {
                        'name': extract_property("biogrid", cells[3]),
                        'alt_accession': extract_property(
                            "locuslink", cells[3]),
                        'tax_id': cells[10],
                        'pathways': None,
                        'aliases': None
                    }

                    # Interaction types
                    inttype = cells[11].replace('psi-mi:', '').replace('"', '')

                    if inttype == 'MI:0407(direct interaction)':
                        is_direct = inttype
                        effect = 'MI:0190(interaction type)'
                    else:
                        is_direct = 'unknown'
                        effect = inttype

                    interaction_types = "effect:%s|is_directed:%s|is_direct:%s" \
                                        % (effect, "directed", is_direct)

                    # Extracting the edge's properties
                    edge_dict = {
                        'interaction_detection_method':
                        cells[6].replace('psi-mi:', '').replace('"', ''),
                        'first_author':
                        cells[7],
                        'publication_ids':
                        cells[8],
                        'interaction_types':
                        interaction_types,
                        'source_db':
                        'biogrid',
                        'interaction_identifiers':
                        None,
                        'confidence_scores':
                        cells[14],
                        'layer':
                        "1"
                    }

                    # Inserting interactor a and b to the node table, then
                    # the edge between them
                    parser.insert_node(node_a_dict)
                    parser.insert_node(node_b_dict)
                    parser.insert_edge(node_a_dict, node_b_dict, edge_dict)

                    # Alias insertion (cells[4] and cells[5]) is currently
                    # disabled.
                except IndexError:
                    # A line with too few columns (for example the empty
                    # string readline() returns at the end of the file) ends
                    # the current piece
                    break

            parser.save_db_to_file(
                DB_DESTINATION + "db_piece_%d" % file_counter)
            parser.db.close()
            file_counter += 1

    print("Data insertion is completed")