Example #1
def chain_helper(
    ref_gff: Union[str, Path],
    ref_group: Union[str, Path],
    addon_gff: Union[str, Path],
    addon_group: Union[str, Path],
    name1: str,
    name2: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
) -> None:
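    """Chain a single add-on sample onto an existing reference.

    Builds a MegaPBTree over the reference GFF/group files, then merges the
    add-on sample into it, writing intermediates under the prefix f"tmp_{name2}".
    """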
    o = sp.MegaPBTree(
        gff_filename=ref_gff,
        group_filename=ref_group,
        self_prefix=name1,
        internal_fuzzy_max_dist=fuzzy_junction,
        allow_5merge=allow_5merge,
        max_3_diff=max_3_diff,
        fastq_filename=None,
    )
    o.add_sample(
        gff_filename=addon_gff,
        group_filename=addon_group,
        sample_prefix=name2,
        output_prefix=f"tmp_{name2}",
        fastq_filename=None,
    )
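A minimal usage sketch for the helper above; every file name and sample label below is a hypothetical placeholder, not something shipped with the code:

# Hypothetical invocation: chain sample "s2" onto reference "s1".
chain_helper(
    ref_gff="s1.collapsed.gff",    # assumed reference GFF
    ref_group="s1.group.txt",      # assumed reference group file
    addon_gff="s2.collapsed.gff",  # assumed add-on GFF
    addon_group="s2.group.txt",    # assumed add-on group file
    name1="s1",
    name2="s2",                    # intermediates written as tmp_s2.*
    fuzzy_junction=5,
    allow_5merge=False,
    max_3_diff=100,
)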
Example #2
def chain_helper(ref_gff, ref_group, addon_gff, addon_group, name1, name2, fuzzy_junction, allow_5merge, max_3_diff):
    o = sp.MegaPBTree(ref_gff, ref_group, self_prefix=name1,
                      internal_fuzzy_max_dist=fuzzy_junction,
                      allow_5merge=allow_5merge,
                      max_3_diff=max_3_diff,
                      fastq_filename=None)
    o.add_sample(addon_gff, addon_group,
                 sample_prefix=name2, output_prefix='tmp_' + name2,
                 fastq_filename=None)
Example #3
def chain_helper(ref_gff, ref_group, addon_gff, addon_group, name1, name2,
                 fuzzy_junction, allow_5merge, max_3_diff):
    #print("chain_helper called w: ref {0}, add on {1}, output: tmp_{2}".format(ref_gff, addon_gff, name2))
    o = sp.MegaPBTree(ref_gff, ref_group, self_prefix=name1,
                      internal_fuzzy_max_dist=fuzzy_junction,
                      allow_5merge=allow_5merge,
                      max_3_diff=max_3_diff,
                      fastq_filename=None)
    o.add_sample(addon_gff, addon_group,
                 sample_prefix=name2, output_prefix='tmp_' + name2,
                 fastq_filename=None)
Example #4
def chain_samples(dirs,
                  names,
                  group_filename,
                  gff_filename,
                  count_filename,
                  field_to_use='norm_nfl',
                  fuzzy_junction=0,
                  allow_5merge=False,
                  fastq_filename=None):
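    """Chain isoforms across samples, one sample at a time.

    Each sample directory is sanity-checked, then samples are merged
    incrementally through MegaPBTree; every merge writes tmp_<name>.*
    intermediates. Afterwards the tmp_*.mega_info.txt files are walked
    backwards to produce all_samples.chained_ids.txt and
    all_samples.chained_count.txt.
    """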

    for d in dirs.values():
        sample_sanity_check(os.path.join(d, group_filename),
                            os.path.join(d, gff_filename),
                            os.path.join(d, count_filename),
                            os.path.join(d, fastq_filename) if fastq_filename is not None else None)

    count_header, count_info = read_count_info(count_filename, dirs,
                                               field_to_use)

    # some names may already start with "tmp_", meaning they are intermediate results that have already been chained;
    # find the first non-"tmp_" name and start from there
    if names[0].startswith('tmp_'):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith('tmp_'):
                chain.append(name[4:])
            else:
                break
        # start_i, name now points at the first "non-tmp" sample
        # we want to go to the last tmp_ sample and read it
        name = names[start_i - 1][4:]  # this is the last tmp_ sample, let's read it
        o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name,
                          internal_fuzzy_max_dist=fuzzy_junction,
                          allow_5merge=allow_5merge,
                          fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None)
        #chain.append(name) # no need, already done above
    else:  # everything is new, start fresh
        name = names[0]
        d = dirs[name]
        chain = [name]
        o = sp.MegaPBTree(os.path.join(d, gff_filename), os.path.join(d, group_filename),
                          self_prefix=name, internal_fuzzy_max_dist=fuzzy_junction,
                          allow_5merge=allow_5merge,
                          fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None)
        start_i = 1

    for name in names[start_i:]:
        assert not name.startswith('tmp_')
        d = dirs[name]
        o.add_sample(os.path.join(d, gff_filename), os.path.join(d, group_filename),
                     sample_prefix=name, output_prefix='tmp_'+name,
                     fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None)
        o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name,
                          internal_fuzzy_max_dist=fuzzy_junction,
                          allow_5merge=allow_5merge,
                          fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None)
        chain.append(name)

    # now recursively chain back by looking at mega_info.txt!!!
    d = {}  # ex: (tmp_1009, PB.1.1) --> mega info dict
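    # Each tmp_<c>.mega_info.txt row links a chained PBID both to the chained
    # PBID of the previous link and to sample <c>'s own PBID, so a reverse walk
    # from the last link recovers every sample's PBID for one super record.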
    for c in chain[1:]:
        for r in DictReader(open('tmp_' + c + '.mega_info.txt'),
                            delimiter='\t'):
            d['tmp_' + c, r['pbid']] = r

    f1 = open('all_samples.chained_ids.txt', 'w')
    f1.write("superPBID")
    f2 = open('all_samples.chained_count.txt', 'w')
    f2.write("superPBID")
    for c in chain:
        f1.write('\t' + c)
        f2.write('\t' + c)
    f1.write('\n')
    f2.write('\n')

    reader = DictReader(open('tmp_' + chain[-1] + '.mega_info.txt'),
                        delimiter='\t')
    for r in reader:
        saw_NA = False
        r0 = r
        answer = defaultdict(lambda: 'NA')  # ex: 1009 --> PB.1.1
        answer2 = defaultdict(lambda: 'NA')  # ex: 1009 --> count
        answer[chain[-1]] = r[chain[-1]]
        if r[chain[-1]] != 'NA':
            answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]]
        for c in chain[::-1][1:-1]:  # skip the last link (handled above) and the first sample (no tmp_ file)
            if r['tmp_' + c] == 'NA':
                saw_NA = True
                break
            else:
                r2 = d['tmp_' + c, r['tmp_' + c]]
                answer[c] = r2[c]
                if answer[c] != 'NA':
                    answer2[c] = count_info[c, answer[c]]
                r = r2
        if not saw_NA:
            answer[chain[0]] = r[chain[0]]
            if answer[chain[0]] != 'NA':
                answer2[chain[0]] = count_info[chain[0], answer[chain[0]]]
        f1.write(r0['pbid'])
        f2.write(r0['pbid'])
        for c in chain:
            f1.write("\t" +
                     answer[c])  # each tissue still share the same PB id
            f2.write("\t" + str(answer2[c]))
        f1.write('\n')
        f2.write('\n')
    f1.close()
    f2.close()

    shutil.copyfile('tmp_' + chain[-1] + '.gff', 'all_samples.chained.gff')
    if fastq_filename is not None:
        shutil.copyfile('tmp_' + chain[-1] + '.rep.fq',
                        'all_samples.chained.rep.fq')

    print >> sys.stderr, "Chained output written to:"
    print >> sys.stderr, "all_samples.chained.gff"
    print >> sys.stderr, f1.name
    print >> sys.stderr, f2.name
    if fastq_filename is not None:
        print >> sys.stderr, "all_samples.chained.rep.fq"
Example #5
def chain_samples(dirs,
                  names,
                  group_filename,
                  gff_filename,
                  count_filename,
                  field_to_use='norm_nfl',
                  fuzzy_junction=0,
                  allow_5merge=False,
                  fastq_filename=None):

    for d in dirs.values():
        sample_sanity_check(os.path.join(d, group_filename),
                            os.path.join(d, gff_filename),
                            os.path.join(d, count_filename),
                            os.path.join(d, fastq_filename) if fastq_filename is not None else None)

    count_info = {}  # key: (sample, PB.1.1) --> count
    for name, d in dirs.items():
        f = open(os.path.join(d, count_filename))
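        # Skip leading '#' comment lines, then rewind so DictReader starts at
        # the column-header line.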
        while True:
            cur = f.tell()
            if not f.readline().startswith('#'):
                break
        f.seek(cur)
        for r in DictReader(f, delimiter='\t'):
            count_info[name, r['pbid']] = r[field_to_use]

    name = names[0]
    d = dirs[name]
    chain = [name]

    o = sp.MegaPBTree(os.path.join(d, gff_filename), os.path.join(d, group_filename),
                      self_prefix=name, internal_fuzzy_max_dist=fuzzy_junction,
                      allow_5merge=allow_5merge,
                      fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None)
    for name in names[1:]:
        d = dirs[name]
        o.add_sample(os.path.join(d, gff_filename), os.path.join(d, group_filename),
                     sample_prefix=name, output_prefix='tmp_'+name,
                     fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None)
        o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name,
                          internal_fuzzy_max_dist=fuzzy_junction,
                          allow_5merge=allow_5merge,
                          fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None)
        chain.append(name)

    # now recursively chain back by looking at mega_info.txt!!!
    d = {}  # ex: (tmp_1009, PB.1.1) --> mega info dict
    for c in chain[1:]:
        for r in DictReader(open('tmp_' + c + '.mega_info.txt'),
                            delimiter='\t'):
            d['tmp_' + c, r['pbid']] = r

    f1 = open('all_samples.chained_ids.txt', 'w')
    f1.write("superPBID")
    f2 = open('all_samples.chained_count.txt', 'w')
    f2.write("superPBID")
    for c in chain:
        f1.write('\t' + c)
        f2.write('\t' + c)
    f1.write('\n')
    f2.write('\n')

    reader = DictReader(open('tmp_' + chain[-1] + '.mega_info.txt'),
                        delimiter='\t')
    for r in reader:
        saw_NA = False
        r0 = r
        answer = defaultdict(lambda: 'NA')  # ex: 1009 --> PB.1.1
        answer2 = defaultdict(lambda: 'NA')  # ex: 1009 --> count
        answer[chain[-1]] = r[chain[-1]]
        if r[chain[-1]] != 'NA':
            answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]]
        for c in chain[::-1][1:-1]:  # skip the last link (handled above) and the first sample (no tmp_ file)
            if r['tmp_' + c] == 'NA':
                saw_NA = True
                break
            else:
                r2 = d['tmp_' + c, r['tmp_' + c]]
                answer[c] = r2[c]
                if answer[c] != 'NA':
                    answer2[c] = count_info[c, answer[c]]
                r = r2
        if not saw_NA:
            answer[chain[0]] = r[chain[0]]
            if answer[chain[0]] != 'NA':
                answer2[chain[0]] = count_info[chain[0], answer[chain[0]]]
        f1.write(r0['pbid'])
        f2.write(r0['pbid'])
        for c in chain:
            f1.write("\t" +
                     answer[c])  # each tissue still share the same PB id
            f2.write("\t" + str(answer2[c]))
        f1.write('\n')
        f2.write('\n')
    f1.close()
    f2.close()

    shutil.copyfile('tmp_' + chain[-1] + '.gff', 'all_samples.chained.gff')
    if fastq_filename is not None:
        shutil.copyfile('tmp_' + chain[-1] + '.rep.fq',
                        'all_samples.chained.rep.fq')

    print >> sys.stderr, "Chained output written to:"
    print >> sys.stderr, "all_samples.chained.gff"
    print >> sys.stderr, f1.name
    print >> sys.stderr, f2.name
    if fastq_filename is not None:
        print >> sys.stderr, "all_samples.chained.rep.fq"
Example #6
def chain_samples(
    dirs,
    names,
    group_filename,
    gff_filename,
    count_filename,
    field_to_use="count_fl",
    fuzzy_junction=0,
    allow_5merge=False,
    max_3_diff=100,
    fastq_filename=None,
):
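    """Chain isoforms across samples (pathlib/DictWriter variant).

    Same algorithm as the versions above, with pathlib paths, DictWriter
    output, logging instead of stderr prints, and support for resuming from
    existing tmp_* intermediates.
    """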
    for d in dirs.values():
        sample_sanity_check(
            Path(d, group_filename),
            Path(d, gff_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
        )

    count_info = read_count_info(count_filename, dirs, field_to_use)

    # some names may already start with "tmp_", meaning they are intermediate results that have already been chained;
    # find the first non-"tmp_" name and start from there
    if names[0].startswith("tmp_"):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith("tmp_"):
                chain.append(name[4:])
            else:
                break
        # start_i, name now points at the first "non-tmp" sample
        # we want to go to the last tmp_ sample and read it
        name = names[start_i - 1][4:]  # this is the last tmp_ sample, let's read it
        o = sp.MegaPBTree(
            f"tmp_{name}.gff",
            f"tmp_{name}.group.txt",
            self_prefix=f"tmp_{name}",
            internal_fuzzy_max_dist=fuzzy_junction,
            allow_5merge=allow_5merge,
            max_3_diff=max_3_diff,
            fastq_filename=f"tmp_{name}.rep.fq"
            if fastq_filename is not None else None,
        )
        # chain.append(name) # no need, already done above
    else:  # everything is new, start fresh
        name = names[0]
        d = Path(dirs[name])
        chain = [name]
        o = sp.MegaPBTree(
            d.joinpath(gff_filename),
            d.joinpath(group_filename),
            self_prefix=name,
            internal_fuzzy_max_dist=fuzzy_junction,
            allow_5merge=allow_5merge,
            max_3_diff=max_3_diff,
            fastq_filename=d.joinpath(fastq_filename)
            if fastq_filename is not None else None,
        )
        start_i = 1

    for name in names[start_i:]:
        if name.startswith("tmp_"):
            raise AssertionError("trying to add a temp file!")
        d = Path(dirs[name])
        o.add_sample(
            d.joinpath(gff_filename),
            d.joinpath(group_filename),
            sample_prefix=name,
            output_prefix=f"tmp_{name}",
            fastq_filename=d.joinpath(fastq_filename)
            if fastq_filename is not None else None,
        )
        o = sp.MegaPBTree(
            f"tmp_{name}.gff",
            f"tmp_{name}.group.txt",
            self_prefix=f"tmp_{name}",
            internal_fuzzy_max_dist=fuzzy_junction,
            allow_5merge=allow_5merge,
            max_3_diff=max_3_diff,
            fastq_filename=f"tmp_{name}.rep.fq"
            if fastq_filename is not None else None,
        )
        chain.append(name)

    # now recursively chain back by looking at mega_info.txt!!!
    d = {}  # ex: (tmp_1009, PB.1.1) --> mega info dict
    for c in chain[1:]:
        for r in DictReader(open(f"tmp_{c}.mega_info.txt"), delimiter="\t"):
            d[f"tmp_{c}", r["superPBID"]] = r

    with open("all_samples.chained_ids.txt",
              "w") as f1, open("all_samples.chained_count.txt", "w") as f2:
        writer1 = DictWriter(f1,
                             fieldnames=["superPBID"] + chain,
                             delimiter="\t")
        writer1.writeheader()

        writer2 = DictWriter(f2,
                             fieldnames=["superPBID"] + chain,
                             delimiter="\t")
        writer2.writeheader()

        reader = DictReader(open(f"tmp_{chain[-1]}.mega_info.txt"),
                            delimiter="\t")
        for r in reader:
            saw_NA = False
            r0 = r
            answer = defaultdict(lambda: "NA")  # ex: 1009 --> PB.1.1
            answer2 = defaultdict(lambda: "NA")  # ex: 1009 --> count
            answer[chain[-1]] = r[chain[-1]]
            if r[chain[-1]] != "NA":
                answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]]
            for c in chain[::-1][1:-1]:  # skip the last link (handled above) and the first sample (no tmp_ file)
                if r[f"tmp_{c}"] == "NA":
                    saw_NA = True
                    break
                else:
                    r2 = d[f"tmp_{c}", r[f"tmp_{c}"]]
                    answer[c] = r2[c]
                    if answer[c] != "NA":
                        answer2[c] = count_info[c, answer[c]]
                    r = r2
            if not saw_NA:
                answer[chain[0]] = r[chain[0]]
                if answer[chain[0]] != "NA":
                    answer2[chain[0]] = count_info[chain[0], answer[chain[0]]]

            rec1 = {"superPBID": r0["superPBID"]}
            rec2 = {"superPBID": r0["superPBID"]}
            for c in chain:
                rec1[c] = answer[c]
                rec2[c] = str(answer2[c])
            writer1.writerow(rec1)
            writer2.writerow(rec2)

    shutil.copyfile(f"tmp_{chain[-1]}.gff", "all_samples.chained.gff")
    if fastq_filename is not None:
        shutil.copyfile(f"tmp_{chain[-1]}.rep.fq",
                        "all_samples.chained.rep.fq")

    logger.info("Chained output written to:")
    logger.info("all_samples.chained.gff")
    logger.info(f1.name)
    logger.info(f2.name)
    if fastq_filename is not None:
        logger.info("all_samples.chained.rep.fq")