Example #1
0
 def load_coliner(self, f_coliner):
     """Load collinear CDS pairs from ``f_coliner`` into this object.

     Every line yielded by ``iterline(f_coliner)`` must split on tabs into
     exactly four fields: a ``<group>-<pair>`` identifier, two CDS names
     and a score.  A trailing ``_CDS`` suffix is removed from both names.
     The pair is oriented so that ``name1`` is the name found in
     ``genome1`` and ``name2`` the one found in ``genome2``; lines whose
     second name is not found in the opposite genome are skipped.

     Each accepted record is appended to ``self.coliner_dict`` under the
     record's ``group_id`` and stored in ``self.pair_dict`` under the
     record's ``(name1, name2)``.

     Raises:
         ValueError: if a line does not have exactly four fields.
         IndexError: if the first field contains no ``<int>-<int>`` match.
         AssertionError: if the first CDS name is in neither genome.
     """
     # Raw string: "\s" and "\d" are regex escapes, not string escapes.
     group_pattern = re.compile(r"\s*(\d+)-\s*(\d+)")
     suffix = "_CDS"

     def strip_suffix(name):
         # str.rstrip("_CDS") removes any trailing run of the characters
         # '_', 'C', 'D', 'S' (e.g. "geneS_CDS" -> "gene"); only the
         # literal suffix must be dropped.
         if name.endswith(suffix):
             return name[:-len(suffix)]
         return name

     for line in iterline(f_coliner):
         g_data, cds1, cds2, score = line.rstrip().split("\t")
         group_id, pair_id = group_pattern.findall(g_data)[0]
         score = score.strip()
         cds1 = strip_suffix(cds1)
         cds2 = strip_suffix(cds2)

         if self.genome1.is_in_genome(cds1):
             if not self.genome2.is_in_genome(cds2):
                 continue
             name1, name2 = cds1, cds2
         else:
             # The first name must then belong to genome2; swap the pair so
             # that name1 always refers to genome1.
             assert self.genome2.is_in_genome(cds1)
             if not self.genome1.is_in_genome(cds2):
                 continue
             name1, name2 = cds2, cds1

         coliner_rec = ColinerRec(int(group_id), int(pair_id), name1, name2,
                                  score)
         coliner_rec.find_uniq_id(self.genome1, self.genome2)
         self.coliner_dict[coliner_rec.group_id].append(coliner_rec)
         self.pair_dict[(coliner_rec.name1,
                         coliner_rec.name2)] = coliner_rec
Example #2
0
def main(args):
    """Translate a protein-interaction table through collinear name pairs.

    Reads the reference name lists, builds a mapping from each "string"
    name to the set of "plant" names paired with it in the collinearity
    file, then rewrites every interaction of the PPI file (first line
    skipped, fields separated by single spaces) for each combination of
    mapped names.  Self-pairs are dropped and each written pair is ordered
    so that ``prot1 < prot2``, with the original names swapped accordingly.
    """
    opts = parse_args(args)
    coliner_path = opts.coliner
    ppi_path = opts.ppi
    string_ref_path = opts.string_ref
    plant_ref_path = opts.plant_ref
    out_path = opts.output

    string_names = load_ref_name(string_ref_path)
    plant_names = load_ref_name(plant_ref_path)

    trans_map = defaultdict(set)
    pairs = iter_coliner_pair(coliner_path, string_names, plant_names)
    for string_name, plant_name in pairs:
        trans_map[string_name].add(plant_name)

    with open(out_path, "w") as out:
        out.write("ori_prot1\tori_prot2\tprot1\tprot2\tcombined_score\n")
        for line_no, line in enumerate(iterline(ppi_path)):
            if line_no == 0:
                # The first line of the PPI file is not an interaction.
                continue
            ori_prot1, ori_prot2, score = line.rstrip().split(" ")
            for prot1 in sorted(trans_map[ori_prot1]):
                for prot2 in sorted(trans_map[ori_prot2]):
                    if prot1 == prot2:
                        continue
                    if prot1 < prot2:
                        row = (ori_prot1, ori_prot2, prot1, prot2, score)
                    else:
                        row = (ori_prot2, ori_prot1, prot2, prot1, score)
                    out.write("\t".join(row) + "\n")
Example #3
0
 def load_blast(self, f_blast):
     """Register BLAST hits between the two genomes from ``f_blast``.

     Each line yielded by ``iterline(f_blast)`` is split on tabs; the first
     two fields are CDS names, from which a trailing ``_CDS`` suffix is
     removed.  A hit is kept only when one name is in ``genome1`` and the
     other in ``genome2``.  The pair is keyed as ``(genome1 name, genome2
     name)`` in ``self.pair_dict``; a ``BlastRec`` is created the first
     time a key is seen, and ``set_pos`` is called on the stored record
     with ``True`` when the line listed the genome1 name first and
     ``False`` otherwise.

     Raises:
         IndexError: if a line has fewer than two tab-separated fields.
         AssertionError: if the first CDS name is in neither genome.
     """
     suffix = "_CDS"

     def strip_suffix(name):
         # str.rstrip("_CDS") removes any trailing run of the characters
         # '_', 'C', 'D', 'S' (e.g. "geneS_CDS" -> "gene"); only the
         # literal suffix must be dropped.
         if name.endswith(suffix):
             return name[:-len(suffix)]
         return name

     for line in iterline(f_blast):
         data = line.rstrip().split("\t")
         cds1 = strip_suffix(data[0])
         cds2 = strip_suffix(data[1])

         if self.genome1.is_in_genome(cds1):
             if not self.genome2.is_in_genome(cds2):
                 continue
             name1, name2, fwd = cds1, cds2, True
         else:
             # The first name must then belong to genome2; swap the pair so
             # that name1 always refers to genome1.
             assert self.genome2.is_in_genome(cds1)
             if not self.genome1.is_in_genome(cds2):
                 continue
             name1, name2, fwd = cds2, cds1, False

         key = (name1, name2)
         if key not in self.pair_dict:
             blast_rec = BlastRec(name1, name2)
             blast_rec.find_uniq_id(self.genome1, self.genome2)
             self.pair_dict[key] = blast_rec
         self.pair_dict[key].set_pos(fwd)
Example #4
0
 def load_uniq_locus_map(self, f_uniq_locus_map):
     """Fill ``self.uniq_locus_map`` from a three-column tab-separated file.

     The first line is skipped.  Every other line must hold exactly
     ``locus_id``, ``assemble`` and ``tid``; rows whose ``assemble`` equals
     ``self.assemble`` are stored as ``uniq_locus_map[tid] = locus_id``.

     Raises:
         ValueError: if a line does not have exactly three fields.
     """
     for line_no, raw in enumerate(iterline(f_uniq_locus_map)):
         if line_no == 0:
             continue
         fields = raw.rstrip().split("\t")
         locus_id, assemble, tid = fields
         if assemble == self.assemble:
             self.uniq_locus_map[tid] = locus_id
Example #5
0
 def load_uniq_cds_map(self, f_uniq_cds_map):
     """Fill ``self.uniq_cds_map`` from a three-column tab-separated file.

     The first line is skipped.  Every other line must hold exactly
     ``hash_id``, ``assemble`` and ``cds_id``; rows whose ``assemble``
     equals ``self.assemble`` are stored as
     ``uniq_cds_map[cds_id] = hash_id`` after a trailing ``_CDS`` suffix is
     removed from ``cds_id``.

     Raises:
         ValueError: if a line does not have exactly three fields.
     """
     suffix = "_CDS"
     for indx, line in enumerate(iterline(f_uniq_cds_map)):
         if indx == 0:
             continue
         hash_id, assemble, cds_id = line.rstrip().split("\t")
         if assemble != self.assemble:
             continue
         # str.rstrip("_CDS") removes any trailing run of the characters
         # '_', 'C', 'D', 'S' (e.g. "geneS_CDS" -> "gene"); only the
         # literal suffix must be dropped.
         if cds_id.endswith(suffix):
             cds_id = cds_id[:-len(suffix)]
         self.uniq_cds_map[cds_id] = hash_id
Example #6
0
def iter_coliner_pair(f, string_li, plant_li):
    """Yield ``(string_name, plant_name)`` pairs from a collinearity file.

    Each line yielded by ``iterline(f)`` is split on tabs and the second
    and third fields are taken as the two names.  A pair is yielded only
    when one name is in ``string_li`` and the other in ``plant_li``; the
    name from ``string_li`` always comes first.

    Args:
        f: value passed through to ``iterline``.
        string_li: iterable of hashable names forming the first reference.
        plant_li: iterable of hashable names forming the second reference.

    Raises:
        IndexError: if a line has fewer than three tab-separated fields.
        AssertionError: if the first name is in neither reference.
    """
    # Membership is tested up to twice per line; converting once to sets
    # makes each test O(1) instead of a linear scan of the lists.
    string_names = frozenset(string_li)
    plant_names = frozenset(plant_li)
    for line in iterline(f):
        data = line.rstrip().split("\t")
        name1 = data[1]
        name2 = data[2]
        if name1 in string_names:
            if name2 in plant_names:
                yield name1, name2
        else:
            assert name1 in plant_names, (
                "{0!r} is in neither reference name list".format(name1))
            if name2 in string_names:
                yield name2, name1
Example #7
0
def main(args):
    """Average interaction scores per protein pair and per locus pair.

    For every (assemble, input file) pair, the scores yielded by
    ``iter_data`` are grouped by ``(assemble, prot1, prot2)`` after a
    trailing ``_CDS`` suffix is removed from both protein names.  The
    locus map file (first line skipped, three tab-separated columns) maps
    ``(assemble, cds_id)`` to an id.  Two tab-separated tables are
    written: the mean score per protein pair, and the mean of those means
    per locus pair, both formatted with one decimal place.

    Raises:
        AssertionError: if the number of input files and assemblies differ.
        KeyError: if a protein has no entry in the locus map.
    """

    def strip_cds_suffix(name, suffix="_CDS"):
        # str.rstrip("_CDS") removes any trailing run of the characters
        # '_', 'C', 'D', 'S' (e.g. "geneS_CDS" -> "gene"); only the
        # literal suffix must be dropped.
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    args = parse_args(args)
    f_in = args.input
    f_uniq_locus = args.uniq_locus
    assemble_li = args.assemble
    f_out_prot = args.out_prot
    f_out_locus = args.out_locus

    assert len(f_in) == len(assemble_li)
    print("Loading score file ...")
    score_dict = defaultdict(list)
    for assemble, path in zip(assemble_li, f_in):
        for prot1, prot2, score in iter_data(path):
            key = (assemble, strip_cds_suffix(prot1), strip_cds_suffix(prot2))
            score_dict[key].append(score)

    print("Building locus map ...")
    locus_map = dict()
    for indx, line in enumerate(iterline(f_uniq_locus)):
        if indx == 0:
            continue
        hash_id, assemble, cds_id = line.rstrip().split("\t")
        locus_map[(assemble, strip_cds_suffix(cds_id))] = hash_id
    locus_score_dict = defaultdict(list)

    print("Writing prot interactrion ...")
    with open(f_out_prot, "w") as f:
        header = "Assemble\tProtein1\tProtein2\tcombined_score\n"
        f.write(header)
        for (assemble, prot1, prot2), score_li in sorted(score_dict.items()):
            score = np.mean(score_li)
            data = [assemble, prot1, prot2, "{0:.1f}".format(score)]
            f.write("\t".join(data) + "\n")
            # Collect the per-protein means so the locus table can average
            # them across all protein pairs that share a locus pair.
            locus1 = locus_map[(assemble, prot1)]
            locus2 = locus_map[(assemble, prot2)]
            locus_score_dict[(locus1, locus2)].append(score)

    print("Writing locus interactrion ...")
    with open(f_out_locus, "w") as f:
        header = "Locus1\tLocus2\tcombined_score\n"
        f.write(header)
        for (locus1, locus2), score_li in sorted(locus_score_dict.items()):
            score = np.mean(score_li)
            data = [locus1, locus2, "{0:.1f}".format(score)]
            f.write("\t".join(data) + "\n")
Example #8
0
def main(args):
    """Filter rG4 hits and write the surviving regions through ``BedFile``.

    Reference records are loaded with ``BedFile(...).load("isoform")`` and
    indexed by name.  Each input line must hold nine tab-separated fields.
    A hit is dropped when its ``rg4_len`` is below ``min_rg4_len`` or when
    any of the three gaps between consecutive start indices, minus
    ``rg4_len``, is shorter than ``min_loop_len``.  Every kept hit is
    sliced from its reference record from the first start index to the
    fourth start index plus ``rg4_len``, named ``<name>_rg4_<counter>``,
    and converted with ``trans("bed12")``.
    """
    opts = parse_args(args)
    in_path = opts.input
    ref_path = opts.ref
    out_path = opts.output
    min_loop_len = opts.min_loop_len
    min_rg4_len = opts.min_rg4_len

    ref_bed = BedFile(ref_path, "r")
    utr_recs = {rec.name: rec for rec in ref_bed.load("isoform")}

    kept_regions = []
    n_kept = 0
    for line in iterline(in_path):
        fields = line.rstrip("\n").split("\t")
        name, _rg4_indx, s1, s2, s3, s4, rg4_len, _score, _seq = fields
        s1, s2, s3, s4, rg4_len = [int(v) for v in (s1, s2, s3, s4, rg4_len)]
        if rg4_len < min_rg4_len:
            continue
        starts = (s1, s2, s3, s4)
        # Each consecutive pair of starts must leave room for one run of
        # length rg4_len plus a loop of at least min_loop_len.
        too_tight = any(
            (nxt - cur - rg4_len - min_loop_len) < 0
            for cur, nxt in zip(starts, starts[1:])
        )
        if too_tight:
            continue
        n_kept += 1
        source_rec = utr_recs[name]
        region_name = "{0}_rg4_{1}".format(name, n_kept)
        region = source_rec.slice(s1, s4 + rg4_len, name=region_name,
                                  strand=True)
        kept_regions.append(region.trans("bed12"))

    BedFile(out_path, "w").write(kept_regions)
Example #9
0
def main(args):
    """Convert per-CDS segment annotations into two BED outputs.

    Reference records are loaded with ``BedFile(...).load("isoform")`` and
    indexed by name.  The first input line is skipped; every other line is
    split on whitespace, using field 0 as the CDS name, field 2 as the
    segment label and fields 3/4 as 1-based start / end.  For each line a
    ``Bed6Record`` in CDS coordinates is collected, and the matching
    reference record is sliced at three times those coordinates, converted
    with ``trans("bed12")``, coloured by label and written to the genome
    output file.  The collected ``Bed6Record`` objects are written through
    ``BedFile`` at the end.

    Raises:
        KeyError: if a CDS name is not in the reference or the label is
            not one of "outside", "inside", "TMhelix".
    """
    opts = parse_args(args)
    in_path = opts.input
    genome_out_path = opts.out_genome
    cds_out_path = opts.out_cds
    ref_path = opts.reference

    rgb_by_label = {
        "outside": "200,50,50",
        "inside": "50,200,50",
        "TMhelix": "50,50,200"
    }
    ref_bed = BedFile(ref_path, "r")
    ref_by_name = {iso.name: iso for iso in ref_bed.load("isoform")}

    segment_recs = []
    with open(genome_out_path, "w") as out:
        for line_no, line in enumerate(iterline(in_path)):
            if line_no == 0:
                continue
            fields = line.rstrip("\n").split()
            cds_name = fields[0]
            label = fields[2]
            start = int(fields[3]) - 1
            end = int(fields[4])
            seg_name = "{0}_{1}".format(cds_name, label)

            seg_rec = Bed6Record()
            seg_rec.init_by_data(cds_name, start, end, label, 0, ".")
            segment_recs.append(seg_rec)

            # Coordinates are multiplied by three before slicing the
            # reference record.
            ref_rec = ref_by_name[cds_name]
            genome_rec = ref_rec.slice(start * 3, end * 3).trans("bed12")
            genome_rec.score = 0
            genome_rec.itemRgb = rgb_by_label[label]
            genome_rec.name = seg_name
            out.write(str(genome_rec) + "\n")

    BedFile(cds_out_path, "w").write(segment_recs)
Example #10
0
def main(args):
    """Write annotated CDS sub-regions as BED12 lines.

    Reference records are loaded with ``BedFile(...).load("isoform")`` and
    indexed by name.  Each non-empty input line is split on whitespace:
    field 0 is the CDS name, fields 1/2 the 1-based start / end, and
    fields 7, 6 and 5 are combined into the output name as
    ``<cds>_<f7>:<f6>(<f5>)``.  The reference record is sliced at three
    times the coordinates, converted with ``trans("bed12")`` and written
    with score 0.

    Raises:
        IndexError: if a line has fewer than eight fields.
        KeyError: if a CDS name is not in the reference.
    """
    opts = parse_args(args)
    in_path = opts.input
    out_path = opts.output
    ref_path = opts.reference

    ref_bed = BedFile(ref_path, "r")
    ref_by_name = {iso.name: iso for iso in ref_bed.load("isoform")}

    with open(out_path, "w") as out:
        for line in iterline(in_path):
            if not line:
                continue
            fields = line.rstrip("\n").split()
            cds_name = fields[0]
            start = int(fields[1]) - 1
            end = int(fields[2])
            region_name = "{0}_{1}:{2}({3})".format(cds_name, fields[7],
                                                    fields[6], fields[5])
            ref_rec = ref_by_name[cds_name]
            # Coordinates are multiplied by three before slicing the
            # reference record.
            region = ref_rec.slice(start * 3, end * 3).trans("bed12")
            region.score = 0
            region.name = region_name
            out.write(str(region) + "\n")
Example #11
0
def iter_data(file):
    """Yield ``(prot1, prot2, score)`` from a five-column tab-separated file.

    The first line yielded by ``iterline(file)`` is skipped.  Every other
    line must hold exactly five fields; the third and fourth are yielded
    as the protein names and the fifth is converted to ``float``.

    Raises:
        ValueError: if a line does not have exactly five fields or the
            last field is not a valid float.
    """
    for line_no, raw in enumerate(iterline(file)):
        if line_no:
            fields = raw.rstrip().split("\t")
            _ori1, _ori2, prot1, prot2, combined_score = fields
            yield prot1, prot2, float(combined_score)
Example #12
0
def load_ref_name(f):
    """Return the second tab-separated field of every line as a list.

    Lines come from ``iterline(f)`` and are right-stripped before being
    split.  Order and duplicates are preserved.

    Raises:
        IndexError: if a line has fewer than two tab-separated fields.
    """
    return [line.rstrip().split("\t")[1] for line in iterline(f)]