Example #1
def main():
    parser = argparse.ArgumentParser(description='fixname - Fix hit name in the blastlist')
    parser.add_argument('input_file')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                        'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()

    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + '.fix'

    hitname = re.compile(r'.*?(gi\|\d*?\|.*?\|.*?\|)(.*)')

    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fw:
        for linum, line in enumerate(fin, start=1):
            if line.lstrip() == '' or line.lstrip()[0] in ('#', 'a'):
                fw.write(line)
                fw.flush()
            else:
                data = line.split('\t')
                match = hitname.match(data[26])

                if match is None:
                    print('No matched hit name in line ' + str(linum) + '.')
                    print('Please check the input file.')
                    sys.exit()
                else:
                    data[4] = match.group(1)
                    data[26] = match.group(1) + match.group(2) + '\n'
                    fw.write('\t'.join(data))
                    fw.flush()
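
These snippets are excerpts of larger scripts: each assumes the standard imports it uses (argparse, re, and sys in this one) plus project-local helper modules such as name, whose genid() builds the unique identifiers in the default output names. The real helper is not shown; a minimal sketch, assuming a short random identifier is all that is required:

# name.py -- hypothetical stand-in for the project's helper module;
# the real implementation is not shown in these examples.
import uuid

def genid():
    """Return a short unique identifier for default output names."""
    return uuid.uuid4().hex[:8]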
Example #2
def main():
    parser = argparse.ArgumentParser(description='fa2lens - Extract length data from a fasta file')
    parser.add_argument('input_file')
    parser.add_argument('-s', '--sep', dest='sep', default='\n',
                        help='separator (default: newline)')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                        'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()

    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + '.leng.txt'

    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fw:
        records = map(str, map(len, list(SeqIO.parse(fin, 'fasta'))))
        fw.write(args.sep.join(records))
        fw.flush()
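
Note that list(SeqIO.parse(...)) above loads every record into memory before measuring it. For large fasta files, a streaming variant (a sketch, not part of the original script) writes each length as it is parsed:

# Sketch: stream sequence lengths one record at a time instead of
# materializing the whole file in memory.
from Bio import SeqIO

def write_lengths(fin, fw, sep='\n'):
    first = True
    for record in SeqIO.parse(fin, 'fasta'):
        if not first:
            fw.write(sep)
        fw.write(str(len(record)))
        first = False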
Example #3
def main():
    parser = argparse.ArgumentParser(description='blastnol - Find the non-overlapping hits in the blast result')
    parser.add_argument('input_file')
    parser.add_argument('-o', '--output-directory', dest='output_dir',
                        help='output directory name. If this option is not specified, the script will '
                        'generate one with a unique identifier in the current directory.')
    parser.add_argument('-q', '--query-sequence', dest='query_fa',
                        help='fasta file of the query sequences. If this option is specified, the script '
                        'will generate a new fasta file that contains truncated long sequences.')
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = args.input_file + '_out_' + name.genid()
    else:
        args.output_dir = args.output_dir.rstrip('/')

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.query_fa is not None:
        query_fa = dict(SeqIO.index(args.query_fa, 'fasta'))
        fw_fa = open(args.output_dir + '/truncated.fa', 'w')

    with open(args.output_dir + '/sort.temp', 'w') as fwsort:
        awk_cmd = "awk -F'\t' 'int($1) { print $0 }' " + args.input_file
        sort_cmd = "sort -t$'\t' -k9g,9 -k4d,4 -k18g,18 -k22gr,22 -k19gr,19 -k26gr,26 -k6gr"
        awk_proc = Popen(awk_cmd, stdout=PIPE, executable='/bin/bash', shell=True)
        sort_proc = Popen(sort_cmd, stdin=awk_proc.stdout, stdout=fwsort, executable='/bin/bash', shell=True)
        sort_proc.communicate()

    seq = {}

    with open(args.output_dir + '/sort.temp', 'r') as fi:
        for line in fi:
            data = line.split('\t')
            query_name = data[3]
            hit_name = data[4]
            query_strand = int(data[8])

            if query_strand < 0:
                query_name = '-' + query_name
                query_hsp_start = int(data[7])
                query_hsp_end = int(data[6])
            else:
                query_hsp_start = int(data[6])
                query_hsp_end = int(data[7])

            if query_name in seq:
                for i in range(len(seq[query_name])):
                    if hit_name in seq[query_name][i][0]:
                        seq[query_name][i][1].append((query_hsp_start, query_hsp_end, [line]))
                        break
                else:
                    seq[query_name].append((hit_name, [(query_hsp_start, query_hsp_end, [line])]))
            else:
                seq.update({query_name: [(hit_name, [(query_hsp_start, query_hsp_end, [line])])]})

    # Combine hsps
    for query_name in seq:
        hit_rank = 1
        for i in range(len(seq[query_name])):
            hit = seq[query_name][i]
            if len(hit[1]) > 1:
                # Combine hsps
                pos_start, pos_end, lines = combine_hsps(hit[1])
                seq[query_name][i] = ([hit[0]], [(pos_start, pos_end)], [lines], [hit_rank])
            else:
                seq[query_name][i] = ([hit[0]], [(hit[1][0][0], hit[1][0][1])], [hit[1][0][2]], [hit_rank])
            hit_rank += 1

    # Sort hits by start position
    for query_name in seq:
        seq[query_name] = sorted(seq[query_name], key=get_start_pos)

    # Check overlap
    for query_name, hits in seq.items():
        while len(hits) > 1:
            position = calculate.get_non_overlap((hits[0][1][0][0], hits[0][1][-1][1]), (hits[1][1][0][0], hits[1][1][0][1]))
            if position is not None:
                # The two sequences are non-overlapping, combine them
                seq[query_name][0] = (hits[0][0] + hits[1][0], hits[0][1] + hits[1][1], hits[0][2] + hits[1][2], hits[0][3] + hits[1][3])
                seq[query_name].pop(1)
            else:
                # Compare the hit rank to determine which one is retained
                if seq[query_name][0][3][-1] < seq[query_name][1][3][0]:
                    # Discard the next hit
                    seq[query_name].pop(1)
                else:
                    # Discard the last hit, and join the next hit
                    seq[query_name][0][0].pop(-1)
                    seq[query_name][0][1].pop(-1)
                    seq[query_name][0][2].pop(-1)
                    seq[query_name][0][3].pop(-1)
                    seq[query_name][0] = (hits[0][0] + hits[1][0], hits[0][1] + hits[1][1], hits[0][2] + hits[1][2], hits[0][3] + hits[1][3])
                    seq[query_name].pop(1)

    # Write data
    with open(args.output_dir + '/hit_cover.tsv', 'w') as fw:
        query_num = 0
        query_num_cover_eq_two = 0
        query_num_cover_eq_three = 0
        query_num_cover_ge_four = 0

        hit_set = set()
        hr = header.blastlist()
        fw.write(hr.get_all_tab() + '\n')
        fw.flush()

        for query, hits in seq.items():
            query = query.split(' ')[0]

            if len(hits[0][0]) > 1:
                query_num += 1

                if len(hits[0][0]) == 2:
                    query_num_cover_eq_two += 1
                elif len(hits[0][0]) == 3:
                    query_num_cover_eq_three += 1
                else:
                    query_num_cover_ge_four += 1

                for lines in hits[0][2]:
                    for line in lines:
                        hit_set.add(line.split('\t')[4])
                        fw.write(line)
                    fw.flush()

                if args.query_fa is not None:
                    # Truncated queries
                    segment_num = 0
                    for pos_start, pos_end in hits[0][1]:
                        fw_fa.write('>' + query + '_s' + str(segment_num) + '\n')
                        fw_fa.write(str(query_fa[query].seq)[pos_start - 1:pos_end] + '\n')
                        fw_fa.flush()
                        segment_num += 1
                    query_fa.pop(query)
            else:
                if args.query_fa is not None:
                    # Full-sequence queries
                    fw_fa.write('>' + query + '\n')
                    fw_fa.write(str(query_fa[query].seq) + '\n')
                    query_fa.pop(query)

        if args.query_fa is not None:
            # No-hit queries
            for query in query_fa:
                fw_fa.write('>' + query + '\n')
                fw_fa.write(str(query_fa[query].seq) + '\n')
                fw_fa.flush()
            fw_fa.close()

        fw.write('\n')
        fw.write('# Number of queries that cover >= 2 hits: ' + str(query_num) + '\n')
        fw.write('#   Cover 2 hits: ' + str(query_num_cover_eq_two) + '\n')
        fw.write('#   Cover 3 hits: ' + str(query_num_cover_eq_three) + '\n')
        fw.write('#   Cover >= 4 hits: ' + str(query_num_cover_ge_four) + '\n')
        fw.write('# Number of covered hits: ' + str(len(hit_set)))
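
The overlap check above delegates to a project-local calculate.get_non_overlap(), which is not shown. From the call site it takes two (start, end) intervals and returns a non-None value only when they do not overlap; a minimal sketch under that assumed contract:

# Sketch of the assumed calculate.get_non_overlap() contract: return the
# two intervals when they are disjoint, None when they overlap.
def get_non_overlap(interval_a, interval_b):
    a_start, a_end = interval_a
    b_start, b_end = interval_b
    if a_end < b_start or b_end < a_start:
        return (interval_a, interval_b)
    return None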
Example #4
def main():
    proglog = logmsg.message(prog='fetchfa', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='fetchfa - Fetch fasta files from Entrez')
    parser.add_argument('input_file', nargs='?')
    parser.add_argument('-d', '--db', dest='database', default='protein',
                        help='database (default: protein)')
    parser.add_argument('-q', '--query', dest='query_id',
                        help='accessions to be fetched. If this option is specified, the script will use these '
                        'values to fetch data, and no input file needs to be handled.')
    parser.add_argument('-o', '--output', dest='output', default='fetchfa_out_' + name.genid(),
                        help='output directory or file name. If this option is not specified, the script '
                        'will generate one with a unique identifier in the current directory.')
    parser.add_argument('-l', '--log', dest='log_file',
                        help='log file name')
    args = parser.parse_args()

    if args.log_file is None:
        fwlog = open(args.output + '.log', 'w')
    else:
        fwlog = open(args.log_file, 'w')

    for i in proglog.start_message():
        fwlog.write(i)
    fwlog.flush()

    Entrez.email = name.genid() + '@example.com'

    if args.query_id is not None:
        with open(args.output + '.fa', 'w') as fw:
            handle = Entrez.efetch(db=args.database,
                                   id=args.query_id,
                                   rettype='fasta',
                                   retmode='text')

            fw.write(handle.read())
            fw.flush()

            fwlog.write('# Fetched sequences: ' + str(len(args.query_id.split(','))) + '\n')
            fwlog.write('#\n')

            for i in proglog.end_message():
                fwlog.write(i)
            fwlog.flush()
    else:
        if not os.path.exists(args.output):
            os.makedirs(args.output)

        with open(args.input_file, 'r') as fin:
            query_num = 0
            for line in fin:
                if line.lstrip() == '' or line.lstrip()[0] in ('#', 'a'):
                    continue

                query_num += 1

                with open(os.path.abspath(args.output) + '/' + line.split('\t')[0] + '.fa', 'w') as fw:
                    alist = line.rstrip().split('\t')[1].split(',')

                    while len(alist) > 30:
                        alist_part = alist[0:30]
                        alist = alist[30:]

                        handle = Entrez.efetch(db=args.database,
                                               id=','.join(alist_part),
                                               rettype='fasta',
                                               retmode='text')
                        fw.write(handle.read())
                        fw.flush()
                        handle.close()

                    handle = Entrez.efetch(db=args.database,
                                           id=','.join(alist),
                                           rettype='fasta',
                                           retmode='text')
                    fw.write(handle.read())
                    fw.flush()
                    handle.close()

            fwlog.write('# Fetched queries: ' + str(query_num) + '\n')
            fwlog.write('#\n')

            for i in proglog.end_message():
                fwlog.write(i)
            fwlog.flush()

    fwlog.close()
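
The inner loop above fetches accessions in batches of 30 to keep each Entrez request small. The same chunking can be factored into one helper; a sketch (the batch size of 30 follows the original):

# Sketch: fetch accessions from Entrez in fixed-size batches.
from Bio import Entrez

def fetch_in_batches(database, accessions, fw, batch_size=30):
    for i in range(0, len(accessions), batch_size):
        handle = Entrez.efetch(db=database,
                               id=','.join(accessions[i:i + batch_size]),
                               rettype='fasta',
                               retmode='text')
        fw.write(handle.read())
        handle.close()
    fw.flush()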
Example #5
def main():
    parser = argparse.ArgumentParser(description='commutate - Find the common mutation profile')
    parser.add_argument('input', nargs='*')
    parser.add_argument('-o', '--output-directory', dest='output', default='commutate_out_' + name.genid(),
                        help='output directory. If this option is not specified, the script will '
                        'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()

    args.output = args.output.rstrip('/')

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    source = []
    source_res_eq_susp = {}
    source_rec_eq_susp = {}
    source_rec_eq_res = {}

    for afile in args.input:
        source.append(afile)
        source_res_eq_susp[afile] = {}
        source_rec_eq_susp[afile] = {}
        source_rec_eq_res[afile] = {}

        with open(afile, 'r') as fin:
            for line in fin:
                if line.lstrip() == '' or line.lstrip()[0] in ('#', 'm'):
                    continue
                data = line.rstrip().split('\t')
                if int(data[3]) > 0:
                    source_res_eq_susp[afile].update({data[1]: set(data[9].split(','))})
                if int(data[4]) > 0:
                    source_rec_eq_susp[afile].update({data[1]: set(data[10].split(','))})
                if int(data[5]) > 0:
                    source_rec_eq_res[afile].update({data[1]: set(data[11].split(','))})

    common_hitname_res_eq_susp = get_common_hitname(source_res_eq_susp)
    common_mutation_profile_res_eq_susp = get_common_mutate(source_res_eq_susp, common_hitname_res_eq_susp)

    common_hitname_rec_eq_susp = get_common_hitname(source_rec_eq_susp)
    common_mutation_profile_rec_eq_susp = get_common_mutate(source_rec_eq_susp, common_hitname_rec_eq_susp)

    common_hitname_rec_eq_res = get_common_hitname(source_rec_eq_res)
    common_mutation_profile_rec_eq_res = get_common_mutate(source_rec_eq_res, common_hitname_rec_eq_res)

    writefile(args.output + '/common_mutation_profile_res_eq_susp.txt', common_mutation_profile_res_eq_susp)
    writefile(args.output + '/common_mutation_profile_rec_eq_susp.txt', common_mutation_profile_rec_eq_susp)
    writefile(args.output + '/common_mutation_profile_rec_eq_res.txt', common_mutation_profile_rec_eq_res)
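
get_common_hitname() and get_common_mutate() are project-local and not shown. From how they are called, the first plausibly intersects hit names across all input files and the second intersects the mutation sets recorded for those common hits; a sketch under that reading:

# Sketch of the assumed helpers: hit names shared by every source file,
# then the mutations shared by every source for each common hit.
def get_common_hitname(source):
    name_sets = [set(hits) for hits in source.values()]
    return set.intersection(*name_sets) if name_sets else set()

def get_common_mutate(source, common_hitnames):
    profile = {}
    for hit in common_hitnames:
        profile[hit] = set.intersection(*[source[afile][hit] for afile in source])
    return profile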
Example #6
def main():
    proglog = logmsg.message(prog='blast2accmap', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='blast2accmap - Extract names of query and hit sequences')
    parser.add_argument('input_file')
    parser.add_argument('-e', '--evalue', dest='ev_thresh', type=float, default=0.01,
                        help='e-value threshold (default: 0.01)')
    parser.add_argument('-t', '--min_hit_num', dest='min_hit_num', type=int, default=1,
                        help='minimum number of hit sequences (default: 1)')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will '
                        'generate one with a unique identifier in the current directory.')
    args = parser.parse_args()

    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + '.blastaccmap'

    total_query_num = 0
    parsed_query_num = 0

    with open(args.input_file, 'r') as result_handle, open(args.output_file, 'w') as fw:
        blast_records = NCBIXML.parse(result_handle)

        for i in proglog.start_message():
            fw.write(i)

        fw.write('#\n')
        fw.write('# E-value threshold: ' + str(args.ev_thresh) + '\n')
        fw.write('# min hit number: ' + str(args.min_hit_num) + '\n')
        fw.write('#\n')
        fw.write('# filename    query_accession,hit_accession_1,hit_accession_2, ...\n\n')
        fw.flush()

        gi = re.compile(r'gi\|(\d+)\|')

        for blast_record in blast_records:
            total_query_num += 1

            if len(blast_record.alignments) < args.min_hit_num:
                continue

            hit_accs = []

            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if alignment.accession in blast_record.query:
                        """If query hit itself, ignore it. """
                        continue

                    if hsp.expect <= args.ev_thresh:
                        match = gi.match(alignment.hit_id)

                        if match is None:
                            print(alignment.accession + ' does not have a gi number.')
                            hit_accs.append(alignment.accession)
                        else:
                            hit_accs.append(match.group(1))
                        break

            if len(hit_accs) >= args.min_hit_num:
                parsed_query_num += 1
                fw.write(blast_record.query + '\t' + blast_record.query + ',')
                fw.write(','.join(hit_accs) + '\n')
                fw.flush()

        fw.write('\n')
        fw.write('# Total queries: ' + str(total_query_num) + '\n')
        fw.write('# Parsed queries: ' + str(parsed_query_num) + '\n')
        fw.write('#\n')

        for i in proglog.end_message():
            fw.write(i)

        fw.flush()
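
This script, like Examples 4 and 7, writes start and end banners through a project-local logmsg.message object. Its interface, start_message() and end_message() yielding lines, suggests a sketch like the following; the exact banner format is an assumption:

# logmsg.py -- hypothetical stand-in for the project's logging helper.
import time

class message(object):
    def __init__(self, prog, cmd):
        self.prog = prog
        self.cmd = cmd

    def start_message(self):
        yield '# Program: ' + self.prog + '\n'
        yield '# Command: ' + self.cmd + '\n'
        yield '# Started: ' + time.strftime('%Y-%m-%d %H:%M:%S') + '\n'

    def end_message(self):
        yield '# Finished: ' + time.strftime('%Y-%m-%d %H:%M:%S') + '\n'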
Example #7
def main():
    proglog = logmsg.message(prog='commonfa', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='commonfa - Generate fasta files of sequences with common hit')
    parser.add_argument('-b', '--blastlist', dest='input_files_blastlist', nargs='*', required=True,
                        help='blastlist files (required)')
    parser.add_argument('-f', '--fasta', nargs='*', dest='input_files_fasta', required=True,
                        help='fasta files (required)')
    parser.add_argument('-o', '--output-directory', dest='output', default='commonfa_out_' + name.genid(),
                        help='output directory. If this option is not specified, the script will '
                        'generate one with a unique identifier in the current directory.')
    parser.add_argument('-p', '--process', dest='process_num', type=int, default=1,
                        help='number of threads (CPUs) to use')
    args = parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.read(os.path.dirname(os.path.abspath(__file__)) + '/config/group.cfg')

    if not os.path.exists(args.output.rstrip('/') + '/msainput'):
        os.makedirs(args.output.rstrip('/') + '/msainput')

    fwlog = open(args.output.rstrip('/') + '/commonfa.log', 'w')

    for i in proglog.start_message():
        fwlog.write(i)

    fwlog.flush()

    awk_cmd = "awk -F'\t' '$5 ~ /ref/ { print $0 }' " + ' '.join(args.input_files_blastlist)
    sort_cmd = "sort -t$'\t' -k5d,5 -k18g,18 -k22gr,22 -k19gr,19 -k26gr,26 -k6gr"

    fwsort = open(args.output.rstrip('/') + '/sort.temp', 'w')
    awk_proc = Popen(awk_cmd, stdout=PIPE, executable='/bin/bash', shell=True)
    sort_proc = Popen(sort_cmd, stdin=awk_proc.stdout, stdout=fwsort, executable='/bin/bash', shell=True)
    sort_proc.communicate()
    fwsort.close()

    fasta = {}

    for filename in args.input_files_fasta:
        fasta.update(dict(SeqIO.index(filename, 'fasta')))

    susp_names = config.get('Susp', 'bdor').split(',')
    res_names = config.get('Res', 'bdor').split(',')
    rec_names = config.get('Rec', 'bdor').split(',')
    has_susp = has_res = has_rec = False
    commonhit = {}

    hitname = re.compile(r'.*gi\|\d*?\|(.*?)\|(.*?)\|.*')

    with open(args.output.rstrip('/') + '/sort.temp', 'r') as fin:
        for line in fin:
            data = line.split('\t')
            match = hitname.match(data[4])

            query_name = data[3]
            hit_name = match.group(2)
            query_frame = int(data[9])

            if hit_name in commonhit:
                if any(i in query_name for i in susp_names):
                    if has_susp is True:
                        continue
                    else:
                        has_susp = True

                if any(i in query_name for i in res_names):
                    if has_res is True:
                        continue
                    else:
                        has_res = True

                if any(i in query_name for i in rec_names):
                    if has_rec is True:
                        continue
                    else:
                        has_rec = True

                commonhit[hit_name].append((query_name, query_frame))
            else:
                commonhit[hit_name] = [(query_name, query_frame)]
                has_susp = has_res = has_rec = False

                if any(i in query_name for i in susp_names):
                    has_susp = True

                if any(i in query_name for i in res_names):
                    has_res = True

                if any(i in query_name for i in rec_names):
                    has_rec = True

    tasks = []
    parsed_num = 0

    for hit in commonhit:
        if len(commonhit[hit]) == len(args.input_files_blastlist):
            tasks.append((hit, commonhit[hit], fasta, args))
            parsed_num += 1

    pool = Pool(processes=args.process_num)
    pool.map(do_parsing, tasks)

    fwlog.write('# Parsed hits: ' + str(parsed_num) + '\n')

    for i in proglog.end_message():
        fwlog.write(i)

    fwlog.flush()
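
The worker do_parsing() handed to Pool.map is not shown. Given the task tuples (hit name, (query, frame) pairs, the fasta index, args), it plausibly writes one fasta file per common hit into the msainput directory; a sketch under that assumption, with frame handling omitted:

# Sketch of the assumed do_parsing() worker: write one fasta file per hit
# containing every query sequence that shares it. Frame handling is omitted.
def do_parsing(task):
    hit, queries, fasta, args = task
    out_path = args.output.rstrip('/') + '/msainput/' + hit + '.fa'
    with open(out_path, 'w') as fw:
        for query_name, query_frame in queries:
            fw.write('>' + query_name + '\n')
            fw.write(str(fasta[query_name].seq) + '\n')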