Esempio n. 1
0
def construct_graph(seq_dict, match_dict, threshold=90):
    """Cluster sequences into components joined by high-identity matches.

    A ``UnionFind`` is built over ``seq_dict``; every match whose query-side
    or reference-side global identity is strictly greater than ``threshold``
    unions its two sequences. After ``uf.rename_component()`` each sequence
    has its component label written to ``seq.label['component']`` and is
    added to the ``Component`` carrying that label.

    Args:
        seq_dict: Mapping from sequence name to a sequence object exposing
            ``.name`` and a mutable ``.label`` mapping.
        match_dict: Mapping whose values expose ``q_name``, ``r_name``,
            ``q_global_identity`` and ``r_global_identity``.
        threshold: Exclusive identity cutoff above which two sequences are
            linked.

    Returns:
        A tuple ``(uf, component_dict)`` where ``component_dict`` maps each
        component label to the ``Component`` holding its member sequences.

    Raises:
        KeyError: If a match names a sequence that is not in ``seq_dict``.
    """
    uf = UnionFind(seq_dict)

    for match in match_dict.values():
        # Resolve both endpoints before testing the threshold so that a match
        # pointing at an unknown sequence fails loudly even when it would not
        # have been linked.
        q_seq = seq_dict[match.q_name]
        r_seq = seq_dict[match.r_name]
        if match.q_global_identity > threshold or match.r_global_identity > threshold:
            uf.union(q_seq.name, r_seq.name)

    uf.rename_component()

    component_dict = {}
    for seq_name, seq in seq_dict.items():
        component_label = uf.component_label[seq_name]
        seq.label['component'] = component_label

        # Create a Component only the first time its label is seen instead of
        # building (and discarding) a fresh one for every sequence.
        component = component_dict.get(component_label)
        if component is None:
            component = Component(component_label)
            component_dict[component_label] = component
        component.add_member(seq)

    return uf, component_dict
Esempio n. 2
0
def gene_isoform_analysis(ref_gtf, seq_dict):
    """Group sequences by the gene named in a GTF-style annotation table.

    The table is read as headerless, tab-separated text with nine columns.
    For every row whose last column starts with
    ``transcript_id "<id>"; gene_id "<id>"`` the matching sequence is looked
    up in ``seq_dict``, tagged with ``seq.label['gene']`` and added to the
    ``Component`` of that gene (once per gene, even if the transcript occurs
    on several rows).

    Args:
        ref_gtf: Path or file-like object accepted by ``pandas.read_table``.
        seq_dict: Mapping from transcript id to a sequence object exposing a
            mutable ``.label`` mapping.

    Returns:
        A dict mapping gene id to the ``Component`` holding its sequences.
        Rows whose attribute column does not match the pattern are ignored.

    Raises:
        KeyError: If a matched transcript id is not present in ``seq_dict``.
    """
    gene_dict = dict()

    gtf_table = pd.read_table(ref_gtf, sep='\t', header=None, low_memory=False)
    gtf_table.columns = [
        'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
        'header'
    ]

    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    # Compiled once here rather than on every row.
    attr_pattern = re.compile(r'transcript_id "(\S+)"; gene_id "(\S+)"')

    # Only the attribute column is needed, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for header in gtf_table['header']:
        found = attr_pattern.match(header)
        if not found:
            continue

        transcript_name = found.group(1)
        gene_name = found.group(2)

        seq = seq_dict[transcript_name]
        seq.label['gene'] = gene_name

        gene = gene_dict.get(gene_name)
        if gene is None:
            # First transcript seen for this gene: create its Component.
            gene = Component(gene_name)
            gene.add_member(seq)
            gene_dict[gene_name] = gene
        elif seq not in gene.member:
            # A transcript usually spans several rows; add it only once.
            gene.add_member(seq)

    return gene_dict