# Example 1
def main():
    """Entry point for msaparser.

    Verifies the interpreter version, writes a log header to the main
    output file, then dispatches the source files to a process pool in
    batches of up to 100, collecting results via a managed queue.
    """
    # Compare version *tuples*, not strings: the old string comparison
    # (sys.version < '3.3') lexically classifies '3.10...' as older than
    # '3.3', wrongly rejecting Python 3.10+.
    if sys.version_info < (3, 3):
        your_version = sys.version.split(' ')[0]
        print('* Your Python version (%s) is too old! Please upgrade to 3.3+!' % your_version)
        sys.exit()

    proglog = logmsg.message(prog='msaparser', cmd=' '.join(sys.argv))

    options, opt_others = config.get_configuration(os.path.dirname(os.path.abspath(__file__)))
    options.output_directory = options.output_directory.rstrip('/')
    options.source_directory = options.source_directory.rstrip('/')

    if not os.path.exists(options.output_directory + '/html'):
        os.makedirs(options.output_directory + '/html')

    mainfile = options.output_directory + '/' + opt_others.get('output_files').get('main')

    # Start a fresh main output file with the program's start banner.
    with open(mainfile, 'w') as fw:
        for msg in proglog.start_message():
            fw.write(msg)
        fw.write('\n')
        fw.flush()
        parser.writeheader(fw)

    cluinput = msaio.FileInput(options.source_directory)

    proc_manager = Manager()
    q_write = proc_manager.Queue()
    proc = pool.Pool(processes=options.process_num)

    # Consume the file list in chunks of at most 100 per pool dispatch.
    # Slicing already yields a short final chunk, so no special casing
    # of the tail is needed (the original if/else was redundant).
    while cluinput.files:
        files = cluinput.files[:100]
        cluinput.files = cluinput.files[100:]

        tasks = [(root, filename, options, opt_others, q_write)
                 for root, filename in files]

        proc.starmap(begin_parse, tasks)
        write_result(q_write, options.output_directory, mainfile)

    # Append the end banner after all batches have been written.
    with open(mainfile, 'a') as fw:
        fw.write('\n')
        for msg in proglog.end_message():
            fw.write(msg)
        fw.flush()
# Example 2
def main():
    """Entry point for fetchfa: fetch FASTA records from NCBI Entrez.

    Two modes:
      * ``--query`` given: fetch those accessions into ``<output>.fa``.
      * otherwise: read an input file of tab-separated
        ``name<TAB>acc1,acc2,...`` lines and write one FASTA file per
        line into the output directory, batching requests 30 IDs at a time.
    """
    proglog = logmsg.message(prog='fetchfa', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='fetchfa - Fetch fasta files from Entrez')
    parser.add_argument('input_file', nargs='?')
    parser.add_argument('-d', '--db', dest='database', default='protein',
                        help='database (default: protein)')
    parser.add_argument('-q', '--query', dest='query_id',
                        help='accessions to be fetched. If this option is specifid, the script will use the values '
                        'to fetch data, and no input file is required to be handled.')
    parser.add_argument('-o', '--output', dest='output', default='fetchfa_out_' + name.genid(),
                        help='output directory or file name. If this option is not specified, the script will generate '
                        'one with unique identifier at current directory.')
    parser.add_argument('-l', '--log', dest='log_file',
                        help='log file name')
    args = parser.parse_args()

    if args.log_file is None:
        fwlog = open(args.output + '.log', 'w')
    else:
        fwlog = open(args.log_file, 'w')

    for i in proglog.start_message():
        fwlog.write(i)
    fwlog.flush()

    Entrez.email = name.genid() + '@example.com'

    if args.query_id is not None:
        # BUG FIX: this branch previously re-opened args.output + '.log' in
        # 'w' mode, truncating the log that already held the start message
        # and silently ignoring --log; it also shadowed (and leaked) the
        # outer log handle. Reuse the handle opened above instead.
        with open(args.output + '.fa', 'w') as fw:
            handle = Entrez.efetch(db=args.database,
                                   id=args.query_id,
                                   rettype='fasta',
                                   retmode='text')
            fw.write(handle.read())
            fw.flush()
            handle.close()  # was leaked before; match the batch branch

        fwlog.write('# Fetched sequences: ' + str(len(args.query_id.split(','))) + '\n')
        fwlog.write('#\n')

        for i in proglog.end_message():
            fwlog.write(i)
        fwlog.flush()
    else:
        # Batch mode requires the positional input file (nargs='?' makes it
        # optional at parse time, so open(None) would crash here otherwise).
        if args.input_file is None:
            parser.error('an input file is required when --query is not given')

        if not os.path.exists(args.output):
            os.makedirs(args.output)

        with open(args.input_file, 'r') as fin:
            query_num = 0
            for line in fin:
                # Skip blanks, comments, and lines starting with 'a'
                # (presumably a header marker -- TODO confirm intent).
                if line.lstrip() == '' or line.lstrip()[0] in ('#', 'a'):
                    continue

                query_num += 1

                with open(os.path.abspath(args.output) + '/' + line.split('\t')[0] + '.fa', 'w') as fw:
                    alist = line.rstrip().split('\t')[1].split(',')

                    # Entrez requests are chunked 30 accessions at a time.
                    while len(alist) > 30:
                        alist_part = alist[0:30]
                        alist = alist[30:len(alist)]

                        handle = Entrez.efetch(db=args.database,
                                               id=','.join(alist_part),
                                               rettype='fasta',
                                               retmode='text')
                        fw.write(handle.read())
                        fw.flush()
                        handle.close()

                    # Final (possibly short) chunk.
                    handle = Entrez.efetch(db=args.database,
                                           id=','.join(alist),
                                           rettype='fasta',
                                           retmode='text')
                    fw.write(handle.read())
                    fw.flush()
                    handle.close()

            fwlog.write('# Fetched queries: ' + str(query_num) + '\n')
            fwlog.write('#\n')

            for i in proglog.end_message():
                fwlog.write(i)
            fwlog.flush()

    fwlog.close()
# Example 3
def main():
    """Entry point for blast2accmap.

    Parses a BLAST XML report and writes, for each query passing the
    e-value and minimum-hit-count filters, a line mapping the query to
    the accessions (preferring the GI number) of its hits.
    """
    proglog = logmsg.message(prog='blast2accmap', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='blast2accmap - Extract names of query and hit sequences')
    parser.add_argument('input_file')
    parser.add_argument('-e', '--evalue', dest='ev_thresh', type=float, default=0.01,
                        help='evalue thresh (default: 0.01)')
    parser.add_argument('-t', '--min_hit_num', dest='min_hit_num', type=int, default=1,
                        help='minimum number of hit sequences (default: 1)')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name. If this option is not specified, the script will generate '
                        'one with unique identifier at current directory.')
    args = parser.parse_args()

    if args.output_file is None:
        args.output_file = args.input_file + '_out_' + name.genid() + 'blastaccmap'

    total_query_num = 0
    parsed_query_num = 0

    with open(args.input_file, 'r') as result_handle, open(args.output_file, 'w') as fw:
        blast_records = NCBIXML.parse(result_handle)

        for i in proglog.start_message():
            fw.write(i)

        fw.write('#\n')
        fw.write('# E-value threshold: ' + str(args.ev_thresh) + '\n')
        fw.write('# min hit number: ' + str(args.min_hit_num) + '\n')
        fw.write('#\n')
        fw.write('# filename    query_accession,hit_accession_1,hit_accession_2, ...\n\n')
        fw.flush()

        # Raw string for the regex (same pattern bytes, clearer intent).
        gi = re.compile(r'gi\|(\d+)\|')

        for blast_record in blast_records:
            total_query_num += 1

            if len(blast_record.alignments) < args.min_hit_num:
                continue

            hit_accs = []

            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if alignment.accession in blast_record.query:
                        """If query hit itself, ignore it. """
                        continue

                    if hsp.expect <= args.ev_thresh:
                        # BUG FIX: the original called .group(1) directly on
                        # re.match()'s result, which raises AttributeError on
                        # a non-match, making the `is None` fallback below
                        # unreachable. Check the match object first.
                        match = gi.match(alignment.hit_id)

                        if match is None:
                            print(alignment.accession + ' does not have gi.')
                            hit_accs.append(alignment.accession)
                        else:
                            hit_accs.append(match.group(1))
                        break

            if len(hit_accs) >= args.min_hit_num:
                parsed_query_num += 1
                fw.write(blast_record.query + '\t' + blast_record.query + ',')
                fw.write(','.join(hit_accs) + '\n')
                fw.flush()

        fw.write('\n')
        fw.write('# Total queries: ' + str(total_query_num) + '\n')
        fw.write('# Parsed queries: ' + str(parsed_query_num) + '\n')
        fw.write('#\n')

        for i in proglog.end_message():
            fw.write(i)

        fw.flush()
# Example 4
def main():
    """Entry point for commonfa.

    Filters blastlist files through an awk/sort shell pipeline, groups
    hits shared across input lists (at most one query per Susp/Res/Rec
    group each), then fans the common hits out to a process pool that
    writes per-hit FASTA files.
    """
    proglog = logmsg.message(prog='commonfa', cmd=' '.join(sys.argv))

    parser = argparse.ArgumentParser(description='commonfa - Generate fasta files of sequences with common hit')
    parser.add_argument('-b', '--blastlist', dest='input_files_blastlist', nargs='*', required=True,
                        help='blastlist files (required)')
    parser.add_argument('-f', '--fasta', nargs='*', dest='input_files_fasta', required=True,
                        help='fasta files (required)')
    parser.add_argument('-o', '--output-directory', dest='output', default='commonfa_out_' + name.genid(),
                        help='output directory. If this option is not specified, the script will generate '
                        'one with unique identifier at current directory.')
    parser.add_argument('-p', '--process', dest='process_num', type=int, default=1,
                        help='number of threads (CPUs) to use')
    args = parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.read(os.path.dirname(os.path.abspath(__file__)) + '/config/group.cfg')

    if not os.path.exists(args.output.rstrip('/') + '/msainput'):
        os.makedirs(args.output.rstrip('/') + '/msainput')

    # BUG FIX: was rstrip('.') -- a typo; every other use of args.output in
    # this function strips a trailing '/'.
    fwlog = open(args.output.rstrip('/') + '/commonfa.log', 'w')

    for i in proglog.start_message():
        fwlog.write(i)

    fwlog.flush()

    # NOTE(review): shell=True with user-supplied filenames interpolated into
    # the command string allows shell injection; consider a list-based
    # invocation if the inputs are untrusted.
    awk_cmd = "awk -F'\t' '$5 ~ /ref/ { print $0 }' " + ' '.join(args.input_files_blastlist)
    sort_cmd = "sort -t$'\t' -k5d,5 -k18g,18 -k22gr,22 -k19gr,19 -k26gr,26 -k6gr"

    fwsort = open(args.output.rstrip('/') + '/sort.temp', 'w')
    awk_proc = Popen(awk_cmd, stdout=PIPE, executable='/bin/bash', shell=True)
    sort_proc = Popen(sort_cmd, stdin=awk_proc.stdout, stdout=fwsort, executable='/bin/bash', shell=True)
    # Close our copy of awk's stdout so awk receives SIGPIPE if sort exits
    # early (standard pipeline idiom).
    awk_proc.stdout.close()
    sort_proc.communicate()
    fwsort.close()

    fasta = {}

    for filename in args.input_files_fasta:
        fasta.update(dict(SeqIO.index(filename, 'fasta')))

    susp_names = config.get('Susp', 'bdor').split(',')
    res_names = config.get('Res', 'bdor').split(',')
    rec_names = config.get('Rec', 'bdor').split(',')
    has_susp = has_res = has_rec = False
    commonhit = {}

    # Raw string for the regex (same pattern bytes, clearer intent).
    hitname = re.compile(r'.*gi\|\d*?\|(.*?)\|(.*?)\|.*')

    with open(args.output.rstrip('/') + '/sort.temp', 'r') as fin:
        for line in fin:
            data = line.split('\t')
            match = hitname.match(data[4])

            query_name = data[3]
            hit_name = match.group(2)
            query_frame = int(data[9])

            if hit_name in commonhit:
                # Keep only the first (best-sorted) query per group flag.
                if any(i in query_name for i in susp_names):
                    if has_susp is True:
                        continue
                    else:
                        has_susp = True

                if any(i in query_name for i in res_names):
                    if has_res is True:
                        continue
                    else:
                        has_res = True

                if any(i in query_name for i in rec_names):
                    if has_rec is True:
                        continue
                    else:
                        has_rec = True

                commonhit[hit_name].append((query_name, query_frame))
            else:
                commonhit[hit_name] = [(query_name, query_frame)]
                has_susp = has_res = has_rec = False

                if any(i in query_name for i in susp_names):
                    has_susp = True

                if any(i in query_name for i in res_names):
                    has_res = True

                if any(i in query_name for i in rec_names):
                    has_rec = True

    tasks = []
    parsed_num = 0

    # A hit is "common" when every blastlist contributed exactly one query.
    for hit in commonhit:
        if len(commonhit[hit]) == len(args.input_files_blastlist):
            tasks.append((hit, commonhit[hit], fasta, args))
            parsed_num += 1

    pool = Pool(processes=args.process_num)
    pool.map(do_parsing, tasks)

    fwlog.write('# Parsed hits: ' + str(parsed_num) + '\n')

    for i in proglog.end_message():
        fwlog.write(i)

    fwlog.flush()
    fwlog.close()  # was never closed before