Code Example #1
def chunk(filename, total_lines, chunksize=500000):
    """Splits the provided file into equal chunks with
    ceiling(num_lines/chunksize) lines each.

    This takes the path to a file and reads through it, splitting it into
    num_chunks = ceiling(total_lines/chunksize) chunks of about
    total_lines/num_chunks lines each. It returns the number of chunks and
    writes the raw_line table files in the format:
    (line_hash, line_num, file_id, raw_line)

    Args:
        filename (str): the file to split into chunks
        total_lines (int): the number of lines in the file at filename
        chunksize (int): maximum number of lines per chunk. Defaults to
            500000.

    Returns:
        int: the number of chunks filename was split into
    """
    #determine number of chunks
    if 'lincs.level4' in filename:
        num_chunks = MAX_CHUNKS
    else:
        num_chunks = math.ceil(total_lines / int(chunksize))
    num_lines = int(total_lines / num_chunks)

    #determine file output information
    path, file = os.path.split(filename)
    chunk_dir = os.path.join(path, 'chunks')
    os.makedirs(chunk_dir, exist_ok=True)
    source_alias, ext = os.path.splitext(file)
    chunk_file = os.path.join(chunk_dir, source_alias + '.raw_line.')

    #divide file into chunks
    line_count = 0
    with open(filename, 'rb') as infile:
        for i in range(1, num_chunks + 1):
            curr_chunk = chunk_file + str(i) + ext
            with open(curr_chunk, 'wb') as out:
                j = 0
                for line in infile:
                    line_count += 1
                    hasher = hashlib.md5()
                    hasher.update(source_alias.encode())
                    hasher.update(str(line_count).encode())
                    hasher.update(line)
                    md5 = hasher.hexdigest()
                    outline = '\t'.join(
                        (md5, str(line_count), source_alias, ''))
                    out.write(outline.encode())
                    cleanline = line.decode("ascii", errors="ignore")
                    cleanline = cleanline.replace('\n', '')
                    cleanline = '"' + cleanline + '"\n'
                    out.write(cleanline.encode())
                    j += 1
                    if j == num_lines and i < num_chunks:
                        break
            u_chunk_file = curr_chunk.replace('raw_line', 'unique.raw_line')
            tu.csu(curr_chunk, u_chunk_file)
    return num_chunks
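
The function above returns the chunk count and writes chunks under a chunks/ subdirectory. A minimal sketch of how it might be driven, assuming the module-level imports (os, math, hashlib) plus the MAX_CHUNKS constant and the tu helper used above are in scope; the file path is hypothetical:

def count_lines(path):
    """Count lines without loading the whole file into memory."""
    with open(path, 'rb') as infile:
        return sum(1 for _ in infile)

total = count_lines('data/source.alias.txt')       # hypothetical input file
num_chunks = chunk('data/source.alias.txt', total)
# chunk files land at data/chunks/source.alias.raw_line.<i>.txt
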
Code Example #2
File: go.py Project: cblatti3/KN_Builder
    def create_mapping_dict(self, filename, key_col=3, value_col=4):
        """Return a mapping dictionary for the provided file.

        This returns a dictionary for use in mapping nodes or edge types from
        the file specified by filename. It parses the raw_line-formatted GO
        OBO file, mapping each term id (and alt_id) to its kn_id::kn_name
        string.

        Args:
            filename (str): The name of the file containing the information
                needed to produce the mapping dictionary.
            key_col (int): Unused by this implementation.
            value_col (int): Unused by this implementation.

        Returns:
            dict: A dictionary for use in mapping nodes or edge types.
        """
        term_map = dict()
        n_type = 'Property'
        n_meta_file = filename.replace('raw_line', 'node_meta')
        node_file = filename.replace('raw_line', 'node')
        orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
        skip = True
        with open(filename) as infile, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            reader = csv.reader(infile, delimiter='\t')
            n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in reader:
                raw = line[3]
                if raw.startswith('[Term]'):
                    skip = False
                    orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
                    continue
                if raw.startswith('[Typedef]'):
                    skip = True
                    continue
                if skip:
                    continue
                if raw.startswith('id: '):
                    orig_id = raw[4:].strip()
                    kn_id = cf.pretty_name(orig_id)
                    continue
                if raw.startswith('name: '):
                    orig_name = raw[6:].strip()
                    kn_name = cf.pretty_name('go_' + orig_name)
                    term_map[orig_id] = kn_id + '::' + kn_name
                    n_writer.writerow([kn_id, kn_name, n_type])
                    n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                    n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
                if raw.startswith('alt_id: '):
                    alt_id = raw[8:].strip()
                    term_map[alt_id] = kn_id + '::' + kn_name
                    n_meta_writer.writerow([kn_id, 'alt_alias', alt_id])
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)

        return term_map
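
The [Term]/[Typedef] handling above is a small state machine over OBO stanzas. A self-contained sketch of the same scanning pattern, without the project-specific cf and tu helpers:

def parse_obo_terms(lines):
    """Map each OBO term id to its name, skipping [Typedef] stanzas."""
    terms = {}
    current = None
    in_term = False
    for raw in lines:
        if raw.startswith('[Term]'):
            in_term, current = True, None
        elif raw.startswith('[Typedef]'):
            in_term = False
        elif in_term and raw.startswith('id: '):
            current = raw[4:].strip()
        elif in_term and raw.startswith('name: ') and current:
            terms[current] = raw[6:].strip()
    return terms

sample = ['[Term]', 'id: GO:0008150', 'name: biological_process']
assert parse_obo_terms(sample) == {'GO:0008150': 'biological_process'}
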
Code Example #3
    def create_mapping_dict(self, filename, key_col=3, value_col=4):
        """Return a mapping dictionary for the provided file.

        This returns a dictionary for use in mapping nodes or edge types from
        the file specified by filename. By default it opens the file specified
        by filename and creates a dictionary using the key_col column as the
        key and the value_col column as the value.

        Args:
            filename (str): The name of the file containing the information
                needed to produce the mapping dictionary.
            key_col (int): The column containing the key for creating the
                dictionary. By default this is column 3.
            value_col (int): The column containing the value for creating the
                dictionary. By default this is column 4.

        Returns:
            dict: A dictionary for use in mapping nodes or edge types.
        """
        src = filename.split('.')[0]
        alias = filename.split('.')[1]
        map_dict = dict()
        n_meta_file = filename.replace('raw_line', 'node_meta')
        node_file = filename.replace('raw_line', 'node')
        if not self.is_map(alias):
            return map_dict
        with open(filename, 'rb') as map_file, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            reader = csv.reader((line.decode('utf-8') for line in map_file),
                                delimiter='\t')
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in reader:
                orig_id = line[key_col].strip()
                orig_name = line[value_col].strip()
                kn_id = cf.pretty_name(orig_id)
                kn_name = cf.pretty_name(src + '_' + orig_name)
                map_dict[orig_id] = kn_id + '::' + kn_name
                n_writer.writerow([kn_id, kn_name])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
        return map_dict
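
Stripped of the node and node_meta bookkeeping, the core of this method is a column-to-column dictionary build over a tab-separated file. A minimal standalone sketch; the column defaults follow the raw_line layout, where columns 0-2 hold (line_hash, line_num, file_id) and the payload starts at column 3:

import csv

def tsv_mapping(path, key_col=3, value_col=4):
    """Build {key column value: value column value} from a TSV file."""
    mapping = {}
    with open(path, encoding='utf-8') as infile:
        for row in csv.reader(infile, delimiter='\t'):
            mapping[row[key_col].strip()] = row[value_col].strip()
    return mapping
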
Code Example #4
def format_raw_line(filename):
    """Creates the raw_line table from the provided file and returns the
       path to the output file.

    This takes the path to a file and reads through the file, adding three tab
    separated columns to the beginning, saving to disk, and then returning the
    output file path. Output rows look like:
    raw_line table (line_hash, line_num, file_id, line_str)

    Args:
        filename (str): the file to convert to raw_line table format

    Returns:
        str: the path to the output file
    """
    #determine file output information
    path, file = os.path.split(filename)
    source_alias, ext = os.path.splitext(file)
    raw_line = os.path.join(path, source_alias + '.raw_line' + ext)

    #convert the file to raw_line format
    line_count = 0
    with open(filename, 'rb') as infile:
        with open(raw_line, 'wb') as outfile:
            for line in infile:
                line_count += 1
                hasher = hashlib.md5()
                hasher.update(source_alias.encode())
                hasher.update(str(line_count).encode())
                hasher.update(line)
                md5 = hasher.hexdigest()
                outline = '\t'.join([md5, str(line_count), source_alias, ''])
                outfile.write(outline.encode())
                cleanline = line.decode('ascii', 'ignore')
                outfile.write(cleanline.encode())
    tu.csu(raw_line, raw_line.replace('raw_line', 'unique.raw_line'),
           [1, 2, 3])
    return raw_line
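
The line_hash written above is an md5 over the source alias, the 1-based line number, and the raw bytes of the line. Isolated as a helper (a sketch for illustration, not a function exported by the module):

import hashlib

def line_hash(source_alias, line_num, raw_bytes):
    """md5 over (file_id, line_num, raw line bytes), matching the loop above."""
    hasher = hashlib.md5()
    hasher.update(source_alias.encode())
    hasher.update(str(line_num).encode())
    hasher.update(raw_bytes)
    return hasher.hexdigest()

print(line_hash('source.alias', 1, b'some\traw\tline\n'))  # hypothetical first line
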
Code Example #5
def table(raw_line, version_dict, taxid_list=None):
    """Uses the provided raw_line file to produce a table file, an
    edge_meta file, a node and/or node_meta file (only for property nodes).

    This returns noting but produces the table formatted files from the
    provided raw_line file:
        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec,
                    n2name, n2hint, n2type, n2spec, et_hint, score,
                    table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id,
                   info_type (evidence, relationship, experiment, or link),
                   info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        raw_line (str): The path to the raw_line file
        version_dict (dict): A dictionary describing the attributes of the
            alias for a source.
        taxid_list (list): A list of taxon ids to support

    Returns:
    """
    if taxid_list is None:
        taxid_list = []

    #outfiles
    table_file = raw_line.replace('raw_line', 'table')
    e_meta_file = raw_line.replace('raw_line', 'edge_meta')

    #static column values
    n1type = 'gene'
    n2type = 'gene'
    score = 1
    src_specific_hints = ["intact", "biogrid"]
    #mapping files
    ppi = os.path.join('..', '..', 'ppi', 'obo_map', 'ppi.obo_map.json')
    with open(ppi) as infile:
        term_map = json.load(infile)

    with open(raw_line, encoding='utf-8') as infile, \
        open(table_file, 'w') as edges,\
        open(e_meta_file, 'w') as e_meta:
        edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
        e_meta_writer = csv.writer(e_meta, delimiter='\t', lineterminator='\n')
        for line in infile:
            line = line.replace('"', '').strip().split('\t')
            if len(line) == 1:
                continue
            if line[1] == '1':
                continue
            chksm = line[0]
            raw = line[3:]
            n1list = raw[0].split('|') + raw[2].split('|')
            n2list = raw[1].split('|') + raw[3].split('|')
            if not n1list or not n2list:
                continue
            match = re.search(r'taxid:(\d+)', raw[9])
            if match is not None:
                n1spec = match.group(1)
                if taxid_list and n1spec not in taxid_list:
                    continue
            else:
                continue
            match = re.search(r'taxid:(\d+)', raw[10])
            if match is not None:
                n2spec = match.group(1)
                if taxid_list and n2spec not in taxid_list:
                    continue
            else:
                continue
            if len(raw) > 35 and raw[35].upper() == 'TRUE':
                et_hint = 'PPI_negative'
            else:
                match = re.search(r'(MI:\d+)', raw[11])
                if match is not None:
                    et_hint = term_map[match.group(1)]
                else:
                    continue
            for n1tuple in n1list:
                if n1tuple.count(':') != 1:
                    continue
                n1hint, n1id = n1tuple.split(':')
                if n1hint in src_specific_hints:
                    continue
                for n2tuple in n2list:
                    if n2tuple.count(':') != 1:
                        continue
                    n2hint, n2id = n2tuple.split(':')
                    if n2hint in src_specific_hints:
                        continue
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1id, n1hint, n1type,
                                             n1spec, n2id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec,
                                          n2id, n2hint, n2type, n2spec,
                                          et_hint, score, t_chksum])

            publist = raw[8]
            interaction_id = raw[13]
            e_meta_writer.writerow([chksm, 'reference', publist])
            e_meta_writer.writerow([chksm, 'detail', interaction_id])

    outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
    tu.csu(e_meta_file, outfile)
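
The two species filters above pull NCBI taxon ids out of the PSI-MI TAB taxonomy columns with a small regex; for example (field value illustrative):

import re

field = 'taxid:9606(human)|taxid:9606(Homo sapiens)'
match = re.search(r'taxid:(\d+)', field)
assert match is not None and match.group(1) == '9606'
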
Code Example #6
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        #n_meta_file = raw_line.replace('raw_line', 'node_meta')
        e_meta_file = raw_line.replace('raw_line', 'edge_meta')

        #static column values
        n1type = 'gene'
        n1hint = 'unknown'
        n2type = 'gene'
        n2hint = 'unknown'
        info_type = 'combined_score'
        edge_types = {2: 'STRING_neighborhood',
                      3: 'STRING_fusion',
                      4: 'STRING_cooccurence',
                      5: 'STRING_coexpression',
                      6: 'STRING_experimental',
                      7: 'STRING_database',
                      8: 'STRING_textmining'}

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(e_meta_file, 'w') as e_meta:
            edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
            e_meta_writer = csv.writer(e_meta, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if line[1] == '1':
                    continue
                chksm = line[0]
                raw = line[3].split(' ')
                n1list = raw[0].split('.')
                n2list = raw[1].split('.')
                if len(n1list) < 2 or len(n2list) < 2:
                    continue
                n1spec = n1list[0]
                n1id = '.'.join(n1list[1:])
                n2spec = n2list[0]
                n2id = '.'.join(n2list[1:])
                for ety in edge_types:
                    et_hint = edge_types[ety]
                    score = raw[ety]
                    if score == '0':
                        continue
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1id, n1hint, n1type, n1spec,
                                             n2id, n2hint, n2type, n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec,
                                          n2id, n2hint, n2type, n2spec, et_hint,
                                          score, t_chksum])
                c_score = raw[9]
                e_meta_writer.writerow([chksm, info_type, c_score])
        outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
        tu.csu(e_meta_file, outfile, [1, 2, 3])
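
Each STRING row fans out into up to seven typed edges, one per evidence channel with a nonzero score. The fan-out in isolation, over a hypothetical space-separated row (two dotted protein ids, then the per-channel scores):

edge_types = {2: 'STRING_neighborhood', 3: 'STRING_fusion',
              4: 'STRING_cooccurence', 5: 'STRING_coexpression',
              6: 'STRING_experimental', 7: 'STRING_database',
              8: 'STRING_textmining'}

raw = '9606.ENSP00000000233 9606.ENSP00000272298 0 0 0 150 0 90 45 180'.split(' ')
edges = [(edge_types[col], raw[col]) for col in edge_types if raw[col] != '0']
# -> [('STRING_coexpression', '150'), ('STRING_database', '90'),
#     ('STRING_textmining', '45')]
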
Code Example #7
File: pfam_prot.py Project: cblatti3/KN_Builder
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')
        #e_meta_file = raw_line.replace('raw_line', 'edge_meta')

        #static column values
        n1type = 'property'
        n_type = 'Property'
        n2type = 'gene'
        n1hint = 'Pfam/Family'
        n2hint = 'Uniprot_gn'
        et_hint = 'pfam_prot'
        n1spec = '0'
        map_dict = dict()
        src = 'pf'

        # load the species map (species_map is currently unused); n2spec
        # comes directly from the alias
        species = (os.path.join('..', '..', 'id_map', 'species',
                                'species.json'))
        with open(species) as infile:
            species_map = json.load(infile)
        n2spec = version_dict['alias']

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            edge_writer = csv.writer(edges,
                                     delimiter='\t',
                                     lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split()
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]

                # skip commented lines
                comment_match = re.match('#', raw[0])
                if comment_match is not None:
                    continue

                orig_id = raw[5].strip()
                orig_name = raw[6].strip()
                kn_id = cf.pretty_name(src + '_' + orig_id)
                kn_name = cf.pretty_name(src + '_' + orig_name)
                map_dict[orig_id] = kn_id + '::' + kn_name
                n_writer.writerow([kn_id, kn_name, n_type])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
                n2orig = raw[0]
                evalue = raw[12]
                evalue = float(evalue)
                score = self.sc_min
                if evalue == 0.0:
                    score = self.sc_max
                if evalue > 0.0:
                    score = round(-1.0 * math.log10(evalue), 4)
                if score > self.sc_max:
                    score = self.sc_max
                if score < self.sc_min:
                    continue

                output = [
                    chksm, kn_id, n1hint, n1type, n1spec, n2orig, n2hint,
                    n2type, n2spec, et_hint,
                    str(score)
                ]
                hasher = hashlib.md5()
                hasher.update('\t'.join(output).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow(output + [t_chksum])
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
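
The e-value handling above is a -log10 transform clamped to the [sc_min, sc_max] range, with sub-threshold hits dropped. A standalone sketch; sc_min and sc_max are instance attributes of the class, so the bounds below are placeholders:

import math

def evalue_score(evalue, sc_min=2.0, sc_max=300.0):
    """-log10(evalue), clamped; None means the row should be skipped."""
    if evalue == 0.0:
        return sc_max
    score = round(-1.0 * math.log10(evalue), 4)
    if score > sc_max:
        return sc_max
    if score < sc_min:
        return None
    return score

assert evalue_score(1e-10) == 10.0
assert evalue_score(0.5) is None  # 0.301 falls below the placeholder sc_min
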
Code Example #8
def main(tablefile, args=None):
    """Maps the nodes for the source:alias tablefile.

    This takes the path to a tablefile (see table_utilities.main) and maps
    the nodes in it using the Redis DB. It then outputs a status file in
    the format (table_hash, n1, n2, edge_type, weight, edge_hash, line_hash,
    status, status_desc), where status is production if both nodes mapped and
    unmapped otherwise. It also outputs an edge file with all rows where
    status is production, in the format (edge_hash, n1, n2, edge_type,
    weight), and an edge2line file in the format (edge_hash, line_hash).

    Args:
        tablefile (str): path to a tablefile to be mapped
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    if 'lincs.level4' in tablefile or 'lincs.exp_meta' in tablefile:
        if os.path.isfile(tablefile.replace('conv', 'node')):
            iu.import_pnode(tablefile.replace('conv', 'node'), args)
        iu.import_edge(tablefile, args)
        return
    rdb = ru.get_database(args)
    edge_file = tablefile.replace('table', 'edge')
    status_file = tablefile.replace('table', 'status')
    ue_file = tablefile.replace('table', 'unique.edge')
    ue2l_file = tablefile.replace('table', 'unique.edge2line')
    us_file = tablefile.replace('table', 'unique.status')
    src_data_dir = os.path.join(args.working_dir, args.data_path,
                                cf.DEFAULT_MAP_PATH)
    species_file = os.path.join(src_data_dir, 'species', 'species.json')
    with open(species_file, 'r') as infile:
        species_dict = json.load(infile)
    supported_taxids = ['unknown'] + list(species_dict.values())
    with open(tablefile, 'r') as infile, \
        open(edge_file, 'w') as edge, \
        open(status_file, 'w') as e_stat:
        reader = csv.reader(infile, delimiter='\t')
        s_writer = csv.writer(e_stat, delimiter='\t', lineterminator='\n')
        e_writer = csv.writer(edge, delimiter='\t', lineterminator='\n')
        to_map = defaultdict(list)
        for line in reader:
            (n1, hint, ntype, taxid) = line[1:5]
            if ntype == 'gene' and taxid in supported_taxids:
                to_map[hint, taxid].append(n1)
            (n2, hint, ntype, taxid) = line[5:9]
            if ntype == 'gene' and taxid in supported_taxids:
                to_map[hint, taxid].append(n2)
        infile.seek(0)
        mapped = {
            k: {n: m
                for m, n in zip(ru.conv_gene(rdb, v, k[0], k[1]), v)}
            for k, v in to_map.items()
        }
        for line in reader:
            (n1, hint, ntype, taxid) = line[1:5]
            if ntype == 'gene':
                if taxid not in supported_taxids:
                    n1_map = 'unmapped-unsupported-species'
                else:
                    n1_map = mapped[hint, taxid][n1]
            else:
                n1_map = n1
            (n2, hint, ntype, taxid) = line[5:9]
            if ntype == 'gene':
                if taxid not in supported_taxids:
                    n2_map = 'unmapped-unsupported-species'
                else:
                    n2_map = mapped[hint, taxid][n2]
            else:
                n2_map = n2
            chksum = line[0]  #line chksum
            et_map = line[9]
            weight = line[10]
            t_chksum = line[11]  #raw edge chksum
            hasher = hashlib.md5()
            hasher.update('\t'.join([n1_map, n2_map, et_map]).encode())
            e_chksum = hasher.hexdigest()
            if 'unmapped' in n1_map:
                status = 'unmapped'
                status_desc = n1_map
            elif 'unmapped' in n2_map:
                status = 'unmapped'
                status_desc = n2_map
            else:
                status = 'production'
                status_desc = 'mapped'
                e_writer.writerow([e_chksum, n1_map, n2_map, et_map, weight])
            s_writer.writerow([t_chksum, n1_map, n2_map, et_map, weight,
                               e_chksum, chksum, status, status_desc])
    tu.csu(edge_file, ue_file)
    tu.csu(status_file, us_file)
    tu.csu(us_file, ue2l_file, [6, 7])
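
The two-pass structure above matters for performance: the first pass groups gene names by (hint, taxid) so that ru.conv_gene can be called once per group instead of once per row, and infile.seek(0) rewinds the file for the output pass. The grouping step in isolation, with made-up rows:

from collections import defaultdict

rows = [('TP53', 'HGNC', 'gene', '9606'),
        ('BRCA1', 'HGNC', 'gene', '9606'),
        ('Trp53', 'MGI', 'gene', '10090')]
to_map = defaultdict(list)
for name, hint, ntype, taxid in rows:
    if ntype == 'gene':
        to_map[hint, taxid].append(name)
# to_map == {('HGNC', '9606'): ['TP53', 'BRCA1'], ('MGI', '10090'): ['Trp53']}
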
Code Example #9
File: go.py Project: cblatti3/KN_Builder
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        e_meta_file = raw_line.replace('raw_line', 'edge_meta')

        #static column values
        alias = version_dict['alias']
        source = version_dict['source']
        n1type = 'property'
        n1spec = '0'
        n2type = 'gene'

        info_type1 = 'reference'
        info_type2 = 'evidence'

        #mapping files
        obo_file = os.path.join('..', 'obo_map', 'go.obo_map.json')
        with open(obo_file) as infile:
            obo_map = json.load(infile)

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(e_meta_file, 'w') as e_meta:
            edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
            e_meta_writer = csv.writer(e_meta, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]

                # skip commented lines
                comment_match = re.match('!', raw[0])
                if comment_match is not None:
                    continue

                qualifier = raw[3]
                # skip "NOT" annotations
                not_match = re.search('NOT', qualifier)
                if not_match is not None:
                    continue

                n1orig = raw[4]
                n1_mapped = obo_map.get(n1orig, "unmapped:no-name::unmapped")
                (n1_id, n1hint) = n1_mapped.split('::')

                n2spec_str = raw[12].split("|", 1)[0].rstrip()  # keep only the first species
                n2spec = n2spec_str.split(":", 1)[1]  # strip the "taxon:" label
                if n2spec == '559292':  # map the S. cerevisiae S288C strain taxid to the species taxid
                    n2spec = '4932'

                reference = raw[5]
                anno_evidence = raw[6]

                score = 2
                et_hint = 'gene_ontology'
                if anno_evidence == 'IEA':
                    score = 1

                n2_id = raw[1]
                n2hint = raw[0]

                if n2hint == "UniProtKB":
                    n2hint = "uniprot_gn"
                if n1hint == "UniProtKB":
                    n1hint = "uniprot_gn"

                for _ in range(2):  # write an edge for raw[1], then raw[2]
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1_id, n1hint, n1type,
                                             n1spec, n2_id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1_id, n1hint, n1type, n1spec,
                                          n2_id, n2hint, n2type, n2spec,
                                          et_hint, score, t_chksum])
                    n2_id = raw[2]

                e_meta_writer.writerow([chksm, info_type1, reference])
                e_meta_writer.writerow([chksm, info_type2, anno_evidence])
        outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
        tu.csu(e_meta_file, outfile)
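
The taxon column of a GAF file can hold a pipe-separated pair for multi-organism annotations, each entry prefixed with "taxon:"; the split logic above keeps only the first id. For example (values illustrative):

taxon_field = 'taxon:9606|taxon:5476'
n2spec = taxon_field.split('|', 1)[0].rstrip().split(':', 1)[1]
assert n2spec == '9606'
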
Code Example #10
    def create_mapping_dict(self, filename, key_col=3, value_col=4):
        """Return a mapping dictionary for the provided file.

        This returns a dictionary for use in mapping nodes or edge types from
        the file specified by filename. For the pathway alias it maps original
        pathway ids to kn_id::kn_name strings; for other aliases it maps
        source-prefixed gene ids to Entrez gene ids.

        Args:
            filename (str): The name of the file containing the information
                needed to produce the mapping dictionary.
            key_col (int): Unused here; the key and value columns are fixed
                at 3 and 4.
            value_col (int): Unused here; see key_col.

        Returns:
            dict: A dictionary for use in mapping nodes or edge types.
        """
        src = filename.split('.')[0]
        alias = filename.split('.')[1]
        map_dict = dict()
        n1_type = 'Property'
        n_meta_file = filename.replace('raw_line', 'node_meta')
        node_file = filename.replace('raw_line', 'node')
        if not self.is_map(alias):
            return map_dict

        if alias == 'pathway':
            with open(filename, 'rb') as map_file, \
                open(n_meta_file, 'w') as n_meta, \
                open(node_file, 'w') as nfile:
                reader = csv.reader(
                    (line.decode('utf-8') for line in map_file),
                    delimiter='\t')
                n_meta_writer = csv.writer(n_meta,
                                           delimiter='\t',
                                           lineterminator='\n')
                n_writer = csv.writer(nfile,
                                      delimiter='\t',
                                      lineterminator='\n')
                for line in reader:
                    orig_id = line[3].strip()
                    orig_name = line[4].strip()
                    mod_id = src + '_' + orig_id.replace('map', '')
                    kn_id = cf.pretty_name(mod_id)
                    kn_name = cf.pretty_name(src + '_' + orig_name)
                    map_dict[orig_id] = kn_id + '::' + kn_name
                    n_writer.writerow([kn_id, kn_name, n1_type])
                    n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                    n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
            outfile = node_file.replace('node', 'unique.node')
            tu.csu(node_file, outfile)
            outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
            tu.csu(n_meta_file, outfile)

        else:
            with open(filename, 'rb') as map_file:
                reader = csv.reader(
                    (line.decode('utf-8') for line in map_file),
                    delimiter='\t')
                for line in reader:
                    orig_id = line[3].strip()
                    orig_name = line[4].strip()
                    mod_id = src + '_' + orig_id
                    kn_id = orig_name.split(':')[1]
                    kn_name = 'EntrezGene'
                    map_dict[mod_id] = kn_id + '::' + kn_name

        return map_dict
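
In the pathway branch the original id has its 'map' prefix stripped before prettifying, while the else branch maps source-prefixed gene ids straight to Entrez ids. A sketch of the pathway-id normalization, assuming a KEGG-style source prefix and id (both illustrative):

src = 'kegg'          # assumed source prefix parsed from the filename
orig_id = 'map00010'  # illustrative KEGG pathway id
mod_id = src + '_' + orig_id.replace('map', '')
assert mod_id == 'kegg_00010'
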
Code Example #11
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')
        e_meta_file = raw_line.replace('raw_line', 'edge_meta')

        #static column values
        n1type = 'gene' #ignoring chemicals
        n1hint = 'UNIPROT_GN'
        n1spec = 'unknown'
        n2type = n1type #ignoring chemicals
        n2hint = n1hint
        n2spec = n1spec
        n3_type = 'property'
        n3hint = 'unknown'
        n3spec = 'unknown'
        score = '1'
        n_type = 'Property'

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(e_meta_file, 'w') as e_meta, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
            e_meta_writer = csv.writer(e_meta, delimiter='\t', lineterminator='\n')
            n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if line[1] == '1': #skip header
                    continue
                chksm = line[0]
                raw = line[3:]
                if len(raw) != 7:  # skip rows with extended information fields
                    continue
                (n1id, et_hint, n2id, src, publist, n3id, mediator_ids) = raw
                et_hint = 'pathcom_' + et_hint.replace('-', '_')
                #n1-n2 edge
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1id, n1hint, n1type, n1spec,
                                         n2id, n2hint, n2type, n2spec, et_hint,
                                         score]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec,
                                      n2id, n2hint, n2type, n2spec, et_hint,
                                      score, t_chksum])
                e_meta_writer.writerow([chksm, 'original_source', src])
                if publist:
                    e_meta_writer.writerow([chksm, 'reference', publist])
                #pathway edge
                if n3id:
                    kn_n3id = cf.pretty_name('paco_' + n3id)
                    n_writer.writerow([kn_n3id, kn_n3id, n_type])
                    n_meta_writer.writerow([kn_n3id, 'orig_id', n3id])
                    for node in [n1id, n2id]:
                        hasher = hashlib.md5()
                        hasher.update('\t'.join([chksm, kn_n3id, n3hint, n3_type,
                                                 n3spec, node, n1hint, n1type, n1spec,
                                                 'pathcom_pathway', score]).encode())
                        t_chksum = hasher.hexdigest()
                        edge_writer.writerow([chksm, kn_n3id, n3hint, n3_type,
                                              n3spec, node, n1hint, n1type, n1spec,
                                              'pathcom_pathway', score, t_chksum])
        outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
        tu.csu(e_meta_file, outfile, [1, 2, 3])
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
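
Each row here can yield up to three edges: one n1-n2 interaction edge, plus a 'pathcom_pathway' membership edge for each endpoint when a pathway id (n3id) is present. The fan-out sketched over an illustrative 7-field row:

raw = ('TP53', 'controls-state-change-of', 'MDM2',
       'pid', '12345678', 'p53pathway', '')  # illustrative values
n1id, et_hint, n2id, src, publist, n3id, mediator_ids = raw
edges = [(n1id, n2id, 'pathcom_' + et_hint.replace('-', '_'))]
if n3id:
    edges += [('paco_' + n3id, node, 'pathcom_pathway') for node in (n1id, n2id)]
assert len(edges) == 3
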
Code Example #12
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')

        #static column values
        alias = version_dict['alias']
        source = version_dict['source']
        mouse_aliases = ["MGI_Mammalian_Phenotype_2013",
                         "MGI_Mammalian_Phenotype_Level_3",
                         "MGI_Mammalian_Phenotype_Level_4", "Mouse_Gene_Atlas"]
        n1type = 'property'
        n_type = 'Property'
        n1spec = '0'
        n1hint = source + '_' + alias
        n2type = 'gene'
        if alias in mouse_aliases:
            n2spec = '10090'
            n2hint = 'MGI'
        else:
            n2spec = '9606'
            n2hint = 'HGNC'
        (et_hint, node_prefix) = self.aliases[alias].split('::')
        score = 1

        if alias == 'PPI_Hub_Proteins':
            n1type = 'gene'
            n1spec = '9606'
            n1hint = 'HGNC'


        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            edge_writer = csv.writer(edges,
                                     delimiter='\t',
                                     lineterminator='\n')
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]
                n1_orig_name = raw[0]
                n1_kn_name = n1_orig_name
                if alias != 'PPI_Hub_Proteins':
                    n1_kn_name = cf.pretty_name(node_prefix + '_' +
                                                n1_orig_name)
                    n_meta_writer.writerow(
                        [n1_kn_name, 'orig_desc', n1_orig_name])
                    n_writer.writerow([n1_kn_name, n1_kn_name, n_type])
                for n2_id in raw[1:]:
                    n2_id = n2_id.split(',')[0]
                    if n2_id == '':
                        continue
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1_kn_name, n1hint, n1type,
                                             n1spec, n2_id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1_kn_name, n1hint, n1type,
                                          n1spec, n2_id, n2hint, n2type,
                                          n2spec, et_hint, score, t_chksum])

        if alias != 'PPI_Hub_Proteins':
            outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
            tu.csu(n_meta_file, outfile)
            outfile = node_file.replace('node', 'unique.node')
            tu.csu(node_file, outfile)
        else:
            os.remove(n_meta_file)
            os.remove(node_file)
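
The n2_id.split(',')[0] above suggests the gene columns arrive as 'SYMBOL,weight' pairs (as in Enrichr's GMT downloads); the split keeps only the symbol. For example (entry illustrative):

n2_field = 'TP53,1.0'
n2_id = n2_field.split(',')[0]
assert n2_id == 'TP53'
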
Code Example #13
File: reactome.py Project: cblatti3/KN_Builder
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        alias = version_dict['alias']

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        e_meta_file = raw_line.replace('raw_line', 'edge_meta')

        if alias == 'Ensembl2Reactome_All_Levels':

            #static column values
            n1type = 'property'
            n1spec = '0'
            n2type = 'gene'
            n2hint = 'Ensembl_GeneID'
            score = 1

            #mapping files
            pathway = os.path.join('..', 'ReactomePathways',
                                   'reactome.ReactomePathways.json')
            with open(pathway) as infile:
                path_map = json.load(infile)
            species = (os.path.join('..', '..', 'id_map', 'species',
                                    'species.json'))
            with open(species) as infile:
                species_map = json.load(infile)

            with open(raw_line, encoding='utf-8') as infile, \
                open(table_file, 'w') as edges,\
                open(n_meta_file, 'w') as n_meta,\
                open(e_meta_file, 'w') as e_meta:
                edge_writer = csv.writer(edges,
                                         delimiter='\t',
                                         lineterminator='\n')
                n_meta_writer = csv.writer(n_meta,
                                           delimiter='\t',
                                           lineterminator='\n')
                e_meta_writer = csv.writer(e_meta,
                                           delimiter='\t',
                                           lineterminator='\n')
                for line in infile:
                    line = line.replace('"', '').strip().split('\t')
                    if len(line) == 1:
                        continue
                    chksm = line[0]
                    raw = line[3:]
                    n1_orig_id = raw[1]
                    n1_mapped = path_map.get(n1_orig_id,
                                             "unmapped:no-name::unmapped")
                    (n1_id, n1hint) = n1_mapped.split('::')
                    n1_link = raw[2]

                    n2_id = raw[0]
                    n2spec_str = raw[5]
                    n2spec = species_map.get(n2spec_str,
                                             "unmapped:unsupported-species")

                    evidence = raw[4]
                    score = 2
                    et_hint = 'reactome_annotation'
                    if evidence == 'IEA':
                        score = 1

                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1_id, n1hint, n1type,
                                             n1spec, n2_id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1_id, n1hint, n1type, n1spec,
                                          n2_id, n2hint, n2type, n2spec,
                                          et_hint, score, t_chksum])
                    n_meta_writer.writerow([n1_id, 'link', n1_link])
                    e_meta_writer.writerow([chksm, 'evidence', evidence])
            outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
            tu.csu(e_meta_file, outfile)
            outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
            tu.csu(n_meta_file, outfile)
        if alias == 'reactome.homo_sapiens.interactions.tab-delimited':

            #static column values
            n1type = 'gene'
            n1spec = '9606'
            n2type = 'gene'
            n2spec = '9606'
            score = 1

            #mapping files

            with open(raw_line, encoding='utf-8') as infile, \
                open(table_file, 'w') as edges,\
                open(e_meta_file, 'w') as e_meta:
                edge_writer = csv.writer(edges,
                                         delimiter='\t',
                                         lineterminator='\n')
                e_meta_writer = csv.writer(e_meta,
                                           delimiter='\t',
                                           lineterminator='\n')
                for line in infile:
                    line = line.replace('"', '').strip().split('\t')
                    if len(line) == 1:
                        continue
                    chksm = line[0]
                    raw = line[3:]

                    # skip commented lines
                    comment_match = re.match('#', raw[0])
                    if comment_match is not None:
                        continue

                    n1_str = raw[0]
                    n1hint = n1_str.split(':', 1)[0]
                    n1_id = n1_str.split(':', 1)[1]
                    n2_str = raw[3]
                    n2hint = n2_str.split(':', 1)[0]
                    if n2hint == "":
                        continue
                    n2_id = n2_str.split(':', 1)[1]

                    et_str = raw[6]
                    et_hint = 'reactome_PPI_' + et_str

                    detail_str = raw[7]
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1_id, n1hint, n1type,
                                             n1spec, n2_id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1_id, n1hint, n1type, n1spec,
                                          n2_id, n2hint, n2type, n2spec,
                                          et_hint, score, t_chksum])
                    e_meta_writer.writerow([chksm, 'detail', detail_str])
                    if len(raw) > 8:
                        ref_str = raw[8]
                        e_meta_writer.writerow([chksm, 'reference', ref_str])
            outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
            tu.csu(e_meta_file, outfile)
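
The kn_id::kn_name convention used by the mapping files (and their 'unmapped:no-name::unmapped' fallback) splits cleanly on the double colon even when the id itself contains a single colon:

n1_mapped = 'unmapped:no-name::unmapped'  # fallback when path_map misses
n1_id, n1hint = n1_mapped.split('::')
assert n1_id == 'unmapped:no-name' and n1hint == 'unmapped'
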
Code Example #14
File: msigdb.py Project: cblatti3/KN_Builder
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns noting but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')
        #e_meta_file = raw_line.replace('raw_line','edge_meta')

        #static column values
        alias = version_dict['alias']
        source = version_dict['source']
        n1type = 'property'
        n_type = 'Property'
        n1spec = '0'
        n1hint = source + '_' + alias
        n2type = 'gene'
        n2spec = '9606'  # assumption of human genes is occasionally incorrect
        n2hint = 'EntrezGene'
        et_hint = source + '_' + alias.replace(".", "_")
        score = 1

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            edge_writer = csv.writer(edges,
                                     delimiter='\t',
                                     lineterminator='\n')
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]
                n1_orig_name = raw[0]
                n1_url = raw[1]
                hasher = hashlib.md5()
                hasher.update(n1_orig_name.encode())
                n1_chksum = hasher.hexdigest()
                n1_kn_id = cf.pretty_name('msig_' + n1_chksum)
                n1_kn_name = cf.pretty_name('msig_' + n1_orig_name)
                n1hint = n1_kn_name
                n_meta_writer.writerow([n1_kn_id, 'orig_desc', n1_orig_name])
                n_meta_writer.writerow([n1_kn_id, 'link', n1_url])
                n_writer.writerow([n1_kn_id, n1_kn_name, n_type])
                for n2_id in raw[2:]:
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1_kn_id, n1hint, n1type,
                                             n1spec, n2_id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1_kn_id, n1hint, n1type,
                                          n1spec, n2_id, n2hint, n2type,
                                          n2spec, et_hint, score, t_chksum])
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
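
Because MSigDB set names can be long and irregular, the node id above is derived from an md5 of the set name rather than the name itself, keeping ids fixed-width and collision-resistant. In isolation (set name illustrative; cf.pretty_name is applied on top in the real code):

import hashlib

n1_orig_name = 'HALLMARK_APOPTOSIS'
n1_chksum = hashlib.md5(n1_orig_name.encode()).hexdigest()
n1_kn_id = 'msig_' + n1_chksum
print(n1_kn_id)
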