Esempio n. 1
0
def do_run(args, data_path, match_score, vfs_list, cons_run):
    """
    Perform a SeqFindr run

    Either BLASTs every input FASTA file against args.seqs_of_interest or,
    when args.existing_data is given, re-parses the BLAST XML files found
    under <args.existing_data>/BLAST_results/ instead of running BLAST.

    Uses:
        * args.index_file
        * args.existing_data
        * args.seqs_of_interest
        * args.tol
        * args.careful

    :param args: the argparse args
    :param data_path: location handed to util.get_fasta_files to collect
                      the input FASTA files
    :param match_score: passed through to build_matrix_row
    :param vfs_list: passed through to build_matrix_row
    :param cons_run: passed to blast.run_BLAST. For an existing run it
                     selects the XML files whose path contains "cons_DB="
                     (truthy) or those that do not (falsy)

    :rtype: a tuple (matrix, y_label). Each matrix row is the strain id
            followed by the values returned by build_matrix_row; y_label
            holds the strain ids in the same order as the rows
    """
    matrix, y_label = [], []
    in_files = util.get_fasta_files(data_path)
    # Reorder if requested
    if args.index_file is not None:
        in_files = util.order_inputs(args.index_file, in_files)
    use_existing = args.existing_data is not None
    # Handle an existing run
    if use_existing:
        found = glob.glob(os.path.join(
            os.path.abspath(args.existing_data), "BLAST_results", "*"))
        # Keep only the results belonging to this kind of run
        blast_xml = [e for e in found
                     if ("cons_DB=" in e) == bool(cons_run)]
        # Labels and XML files come back index-aligned, so y_label[idx]
        # always names the XML parsed for row idx
        y_label, in_files = _match_inputs_to_results(in_files, blast_xml)
    for idx, subject in enumerate(in_files):
        if not use_existing:
            strain_id = blast.make_BLAST_database(subject)
            y_label.append(strain_id)
            database = os.path.basename(subject)
            blast_xml = blast.run_BLAST(
                args.seqs_of_interest,
                os.path.join(os.getcwd(), "DBs", database), args, cons_run)
            accepted_hits = blast.parse_BLAST(blast_xml, float(args.tol),
                                              args.careful)
        else:
            strain_id = y_label[idx]
            accepted_hits = blast.parse_BLAST(subject, float(args.tol),
                                              args.careful)
        row = build_matrix_row(vfs_list, accepted_hits, match_score)
        row.insert(0, strain_id)
        matrix.append(row)
    return matrix, y_label


def _match_inputs_to_results(in_files, blast_xml):
    """
    Pair each input file with the existing BLAST XML result matching it

    The strain id of an XML file is the text between "ID=" and
    "_blast.xml" in its path. An input file matches the first XML whose
    strain id occurs in the input file's path; inputs without a match are
    dropped.

    :param in_files: the input file paths, in the order rows should appear
    :param blast_xml: the paths of the existing BLAST XML results

    :rtype: a tuple (labels, xml_files) of index-aligned lists following
            the order of in_files
    """
    tagged = [(xml.split("ID=")[-1].split("_blast.xml")[0], xml)
              for xml in blast_xml]
    labels, ordered = [], []
    for fasta in in_files:
        for sid, xml in tagged:
            if sid in fasta:
                # Record the label together with its own XML; matching
                # them in two separate passes let glob's arbitrary order
                # decide which XML ended up under which label
                labels.append(sid)
                ordered.append(xml)
                break
    return labels, ordered
def do_run(args, data_path, match_score, vfs_list, cons_run):
    """
    Perform a SeqFindr run

    Either BLASTs every input FASTA file against args.seqs_of_interest or,
    when args.existing_data is given, re-parses the BLAST XML files found
    under <args.existing_data>/BLAST_results/ instead of running BLAST.

    Uses:
        * args.index_file
        * args.existing_data
        * args.seqs_of_interest
        * args.tol
        * args.careful

    :param args: the argparse args
    :param data_path: location handed to util.get_fasta_files to collect
                      the input FASTA files
    :param match_score: passed through to build_matrix_row
    :param vfs_list: passed through to build_matrix_row
    :param cons_run: passed to blast.run_BLAST. For an existing run it
                     selects the XML files whose path contains "cons_DB="
                     (truthy) or those that do not (falsy)

    :rtype: a tuple (matrix, y_label). Each matrix row is the strain id
            followed by the values returned by build_matrix_row; y_label
            holds the strain ids in the same order as the rows
    """
    matrix, y_label = [], []
    in_files = util.get_fasta_files(data_path)
    # Reorder if requested
    if args.index_file is not None:
        in_files = util.order_inputs(args.index_file, in_files)
    use_existing = args.existing_data is not None
    # Handle an existing run
    if use_existing:
        found = glob.glob(os.path.join(
            os.path.abspath(args.existing_data), "BLAST_results", "*"))
        # Keep only the results belonging to this kind of run
        blast_xml = [e for e in found
                     if ("cons_DB=" in e) == bool(cons_run)]
        # Labels and XML files come back index-aligned, so y_label[idx]
        # always names the XML parsed for row idx
        y_label, in_files = _match_inputs_to_results(in_files, blast_xml)
    for idx, subject in enumerate(in_files):
        if not use_existing:
            strain_id = blast.make_BLAST_database(subject)
            y_label.append(strain_id)
            database = os.path.basename(subject)
            blast_xml = blast.run_BLAST(
                args.seqs_of_interest,
                os.path.join(os.getcwd(), "DBs", database), args, cons_run)
            accepted_hits = blast.parse_BLAST(blast_xml, float(args.tol),
                                              args.careful)
        else:
            strain_id = y_label[idx]
            accepted_hits = blast.parse_BLAST(subject, float(args.tol),
                                              args.careful)
        row = build_matrix_row(vfs_list, accepted_hits, match_score)
        row.insert(0, strain_id)
        matrix.append(row)
    return matrix, y_label


def _match_inputs_to_results(in_files, blast_xml):
    """
    Pair each input file with the existing BLAST XML result matching it

    The strain id of an XML file is the text between "ID=" and
    "_blast.xml" in its path. An input file matches the first XML whose
    strain id occurs in the input file's path; inputs without a match are
    dropped.

    :param in_files: the input file paths, in the order rows should appear
    :param blast_xml: the paths of the existing BLAST XML results

    :rtype: a tuple (labels, xml_files) of index-aligned lists following
            the order of in_files
    """
    tagged = [(xml.split("ID=")[-1].split("_blast.xml")[0], xml)
              for xml in blast_xml]
    labels, ordered = [], []
    for fasta in in_files:
        for sid, xml in tagged:
            if sid in fasta:
                # Record the label together with its own XML; matching
                # them in two separate passes let glob's arbitrary order
                # decide which XML ended up under which label
                labels.append(sid)
                ordered.append(xml)
                break
    return labels, ordered
def strip_bases(args):
    """
    Strip the 1st and last 'N' bases from mapping consensuses

    Uses:
        * args.cons
        * args.seqs_of_interest
        * args.strip

    To avoid the effects of lead in and lead out coverage resulting in
    uncalled bases

    The stripped consensuses are written to a 'stripped' directory inside
    args.cons and the stripped sequences of interest to a sibling file
    whose name has '_trimmed' inserted before the extension. The original
    files are left untouched.

    :param args: the argparse args containing args.strip value

    :type args: argparse args

    :raises ValueError: if args.strip is not an integer or is negative
    :raises OSError: if the stripped directory cannot be created

    :rtype: the updated args to reflect the args.cons &
            args.seqs_of_interest location (args.strip is stored as an int)
    """
    # Get in the fasta files in the consensus directory
    fasta_in = util.get_fasta_files(args.cons)
    strip = int(args.strip)
    if strip < 0:
        raise ValueError("args.strip must be >= 0, got %d" % strip)
    # Build a stripped directory
    new_cons_dir = os.path.join(args.cons, 'stripped')
    try:
        os.mkdir(new_cons_dir)
    except OSError:
        # Only an already existing directory is fine to carry on with;
        # anything else (permissions, missing parent) is a real failure
        if not os.path.isdir(new_cons_dir):
            raise
        sys.stderr.write("A stripped directory exists. Overwriting\n")
    # Update the args.cons to the stripped directory
    args.cons = new_cons_dir
    args.strip = strip
    # Strip the start and end
    for fa in fasta_in:
        out = os.path.join(args.cons, os.path.basename(fa))
        _write_stripped_fasta(fa, out, strip)
    # Trim the db as well. splitext only looks at the final path
    # component, so dots in directory names or a missing extension do not
    # mangle the output name
    root, ext = os.path.splitext(args.seqs_of_interest)
    stripdb = root + '_trimmed' + ext
    _write_stripped_fasta(args.seqs_of_interest, stripdb, strip)
    # Update the args.seqs_of_interest
    args.seqs_of_interest = stripdb
    return args


def _write_stripped_fasta(src, dst, strip):
    """
    Copy the FASTA records in src to dst minus strip bases from each end

    :param src: path of the FASTA file to read
    :param dst: path of the FASTA file to write (overwritten if present)
    :param strip: the non-negative number of bases to drop from each end
    """
    # Plain "r": the "U" flag was removed in Python 3.11 and universal
    # newlines are the default for text mode on Python 3
    with open(src, "r") as fin, open(dst, 'w') as fout:
        for rec in SeqIO.parse(fin, "fasta"):
            # Use an explicit end index: seq[strip:-strip] is seq[0:0],
            # i.e. empty, when strip is 0
            end = max(strip, len(rec.seq) - strip)
            rec.seq = rec.seq[strip:end]
            SeqIO.write(rec, fout, "fasta")
Esempio n. 4
0
def strip_bases(args):
    """
    Strip the 1st and last 'N' bases from mapping consensuses

    Uses:
        * args.cons
        * args.seqs_of_interest
        * args.strip

    To avoid the effects of lead in and lead out coverage resulting in
    uncalled bases

    The stripped consensuses are written to a 'stripped' directory inside
    args.cons and the stripped sequences of interest to a sibling file
    whose name has '_trimmed' inserted before the extension. The original
    files are left untouched.

    :param args: the argparse args containing args.strip value

    :type args: argparse args

    :raises ValueError: if args.strip is not an integer or is negative
    :raises OSError: if the stripped directory cannot be created

    :rtype: the updated args to reflect the args.cons &
            args.seqs_of_interest location (args.strip is stored as an int)
    """
    # Get in the fasta files in the consensus directory
    fasta_in = util.get_fasta_files(args.cons)
    strip = int(args.strip)
    if strip < 0:
        raise ValueError("args.strip must be >= 0, got %d" % strip)
    # Build a stripped directory
    new_cons_dir = os.path.join(args.cons, 'stripped')
    try:
        os.mkdir(new_cons_dir)
    except OSError:
        # Only an already existing directory is fine to carry on with;
        # anything else (permissions, missing parent) is a real failure
        if not os.path.isdir(new_cons_dir):
            raise
        sys.stderr.write("A stripped directory exists. Overwriting\n")
    # Update the args.cons to the stripped directory
    args.cons = new_cons_dir
    args.strip = strip
    # Strip the start and end
    for fa in fasta_in:
        out = os.path.join(args.cons, os.path.basename(fa))
        _write_stripped_fasta(fa, out, strip)
    # Trim the db as well. splitext only looks at the final path
    # component, so dots in directory names or a missing extension do not
    # mangle the output name
    root, ext = os.path.splitext(args.seqs_of_interest)
    stripdb = root + '_trimmed' + ext
    _write_stripped_fasta(args.seqs_of_interest, stripdb, strip)
    # Update the args.seqs_of_interest
    args.seqs_of_interest = stripdb
    return args


def _write_stripped_fasta(src, dst, strip):
    """
    Copy the FASTA records in src to dst minus strip bases from each end

    :param src: path of the FASTA file to read
    :param dst: path of the FASTA file to write (overwritten if present)
    :param strip: the non-negative number of bases to drop from each end
    """
    # Plain "r": the "U" flag was removed in Python 3.11 and universal
    # newlines are the default for text mode on Python 3
    with open(src, "r") as fin, open(dst, 'w') as fout:
        for rec in SeqIO.parse(fin, "fasta"):
            # Use an explicit end index: seq[strip:-strip] is seq[0:0],
            # i.e. empty, when strip is 0
            end = max(strip, len(rec.seq) - strip)
            rec.seq = rec.seq[strip:end]
            SeqIO.write(rec, fout, "fasta")