Example #1
def unwrap_fasta(infile, outfile, strip_comment=False):
    """
    Read FASTA sequences from *infile* and write them unwrapped
    (one sequence per line) to *outfile*.

    :param str infile: The path to the input FASTA file.
    :param str outfile: The path to the output file.
    :param bool strip_comment: If True, reduce each title to just the
        record name (via Fastq2Fasta.just_name).
    """
    with open(outfile, "w") as fasta_out:
        if strip_comment:
            FastaIO.FastaWriter(
                fasta_out, wrap=None,
                record2title=Fastq2Fasta.just_name).write_file(
                    SeqIO.parse(infile, 'fasta'))
        else:
            FastaIO.FastaWriter(fasta_out, wrap=None).write_file(
                SeqIO.parse(infile, 'fasta'))
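The examples on this page all share the same core idea: constructing FastaIO.FastaWriter with wrap=None so each sequence is written on a single line. A minimal self-contained sketch of that pattern, assuming only that Biopython is installed (the record contents are invented for illustration):

from io import StringIO

from Bio.Seq import Seq
from Bio.SeqIO import FastaIO
from Bio.SeqRecord import SeqRecord

# Invented in-memory records standing in for a parsed FASTA file.
records = [SeqRecord(Seq("ACGT" * 30), id="seq1", description="toy sequence")]

out = StringIO()
FastaIO.FastaWriter(out, wrap=None).write_file(records)
print(out.getvalue())  # the 120 nt sequence appears on a single unwrapped line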
Example #2
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list,
            sample_metadata_list):
    otu_df_list = []
    rep_seq_ids = set()
    seqs = []
    # Create OTU table
    for unhashed_otu_table in unhashed_otu_table_list:
        otu_df_list.append(hash_otu_table(unhashed_otu_table))
    otu_df = pd.concat(otu_df_list, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index), list(otu_df.columns))
    # Create rep seqs
    for unhashed_rep_seqs in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed_rep_seqs, rep_seq_ids))
    otu_table_ids = set(otu_df.index)
    assert otu_table_ids == rep_seq_ids
    assert len(otu_df.index) == len(rep_seq_ids)
    # Merge sample metadata
    sample_metadata = pd.concat(
        [pd.read_csv(s, sep="\t") for s in sample_metadata_list])
    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid,
                          "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
Example #3
def first_occurrences_only(fasta_in):
    hashes_seen_before = set()  # sha256 digests of sequences already written
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(
        out_basedir, in_fasta_basename + "_first_occurrences_only.fasta")

    total_seq_count = 0

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            total_seq_count += 1
            record_hash = hashlib.sha256(str(
                record.seq).encode("UTF-8")).hexdigest()
            if record_hash not in hashes_seen_before:
                hashes_seen_before.add(record_hash)
                fasta_out.write_record(record)
            else:
                # identical to a sequence earlier in the input file; skip it
                pass

    print("{} seqs seen".format(total_seq_count))
    print("{} unique seqs found".format(len(hashes_seen_before)))
    print("{} identical duplicates removed".format(total_seq_count -
                                                   len(hashes_seen_before)))
Example #4
def remap_tax_id(fasta_in, remap_table):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(
        out_basedir, in_fasta_basename + "_annotated_for_beast.fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    id_map = dict()
    with open(remap_table, "r") as map_handle:
        for line in map_handle:
            seqid, *other_fields = line.rstrip("\n").split("\t")
            if seqid in id_map:
                raise LookupError("%s already found in map" % seqid)
            if seqid == "taxa":
                # if this is a figtree-formatted annotation file, skip the
                # header row that has nothing but column labels
                continue
            id_map[seqid] = other_fields

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            if record.id in id_map:
                if sum(len(x) for x in id_map[record.id]) > 0:
                    new_description = "|".join(id_map[record.id]).replace("||", "|?|").replace("||", "|?|")
                    record.description = new_description
                    fasta_out.write_record(record)
            else:
                print("Warning: '{}' not found in {}".format(
                    record.id, os.path.basename(remap_table)))
Example #5
def write_fasta_1line(input_fasta_file, output_fasta_file):
    with open(input_fasta_file, "r") as handle:
        record_list = list(SeqIO.parse(handle, "fasta"))
        print(input_fasta_file)
        print("Number of protein sequences: ", len(record_list))
    with open(output_fasta_file, "w") as handle:
        fasta_writer = FastaIO.FastaWriter(handle, wrap=None)
        fasta_writer.write_file(record_list)
Example #6
def Main():
    parser = argparse.ArgumentParser(description='Generate synthetic reads.',
                                     fromfile_prefix_chars='@')
    parser.add_argument("-t",
                        "--num_transpositions",
                        type=int,
                        default=50,
                        help="Number of transpositions to generate")
    parser.add_argument("-r",
                        "--num_reads",
                        type=int,
                        default=10000,
                        help="Number of reads to generate per transposition.")
    parser.add_argument("-l",
                        "--read_length",
                        type=int,
                        default=100,
                        help="Number of reads to generate per transposition.")
    parser.add_argument("-o",
                        "--output_fname",
                        default='generated_transposition_reads.fa',
                        help="Where to write FASTA output to.")
    TranspositionParams.AddArgs(parser)
    args = parser.parse_args()

    tn_params = TranspositionParams.FromArgs(args)
    insert_gen = InsertGenerator.FromTranspositionParams(tn_params)

    n_trans = args.num_transpositions
    n_reads = args.num_reads
    read_len = args.read_length
    print('Generating %d random transpositions with %d random %d NT reads each' %
          (n_trans, n_reads, read_len))

    print('Writing generated reads to FASTA')
    x = 0
    with open(args.output_fname, 'w') as fh:
        writer = FastaIO.FastaWriter(fh)
        writer.write_header()
        for construct_num in range(n_trans):
            trans = Transposition(construct_num, insert_gen,
                                  tn_params.backbone_seq,
                                  tn_params.backbone_start_offset)
            for read_num in range(n_reads):
                frag = trans.Shear(read_num, read_len)
                record = frag.ToSeqRecord()
                writer.write_record(record)

                x += 1
                if x % 1000000 == 0:
                    print("Created %d reads" % x)

        writer.write_footer()

    print('Zipping generated reads.')
    gzip_fname = '%s.gz' % args.output_fname
    with open(args.output_fname, 'rb') as plain, GzipFile(gzip_fname, mode='wb') as gzipf:
        gzipf.write(plain.read())  # compress the file contents, not just the filename string
Example #7
def export_dna_record(gene_seq, gene_id, gene_description, output_handle):
    seq_object = Seq(gene_seq, IUPAC.unambiguous_dna)
    seq_record = SeqRecord(seq_object, id=gene_id, description=gene_description)
    fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
    fasta_out.write_header()
    fasta_out.write_record(seq_record)
    fasta_out.write_footer()
Example #8
def write(data, filename):
    records = []
    with open(filename, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        for element in data:
            sequence = SeqRecord(Seq(element['seq']),
                                 id=element['id'],
                                 description=element['description'])
            records.append(sequence)
        fasta_out.write_file(records)
Example #9
def recomp(input, output, both=False):
    fasta_out = FastaIO.FastaWriter(output, wrap=None)
    fasta_out.write_header()
    for seq_record in SeqIO.parse(input, "fasta"):
        rc_rec = seq_record.reverse_complement(id=seq_record.id + "_RC",
                                               description="")
        if both:
            fasta_out.write_record(seq_record)
        fasta_out.write_record(rc_rec)
    fasta_out.write_footer()
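A quick usage sketch for recomp() above, assuming a Biopython version where FastaWriter still exposes write_header()/write_footer(); the input sequence is invented:

from io import StringIO

fasta_in = StringIO(">seq1 example\nACGTTG\n")
fasta_out = StringIO()
recomp(fasta_in, fasta_out, both=True)
print(fasta_out.getvalue())
# >seq1 example
# ACGTTG
# >seq1_RC
# CAACGT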
Example #10
def get_fasta(pdb_file, fasta_file, transfer_ids=None):
    fasta_writer = FastaIO.FastaWriter(fasta_file)
    fasta_writer.write_header()
    for rec in PdbIO.PdbSeqresIterator(pdb_file):
        if len(rec.seq) == 0:
            continue
        if transfer_ids is not None and rec.id not in transfer_ids:
            continue
        print(rec.id, rec.seq, len(rec.seq))
        fasta_writer.write_record(rec)
Example #11
def perform_mapping(mapping_dir,
                    og_files,
                    threshold=1,
                    exclude_species=["none"]):
    og_dict = {}
    # read in the OGs with their amino-acid sequences
    og = list(SeqIO.parse(og_files, "fasta"))
    for record in og:
        key = record.description.split(" | ")[-1]
        if key in og_dict:
            ids = [rec.id for rec in og_dict[key]]
            if record.id not in ids:
                og_dict[key].append(record)
        else:
            og_dict[key] = []
            og_dict[key].append(record)

    # parse the mapped reads to ogs to dictionary
    all_dict = {}
    for file in glob.glob(mapping_dir + "*.fa"):
        og_name = file.split("_")[-1].split(".")[0]
        og = og_dict[og_name]

        # change ids to species names
        for i, record in enumerate(og):
            s = record.id[0:5]
            record.id = s

        # find the best representative seq based by mapping
        mapping = list(SeqIO.parse(file, "fasta"))
        best_translated_seq = find_best_translation_by_similarity(
            mapping, og[0], exclude_species=exclude_species)
        if best_translated_seq is not None:
            og.append(best_translated_seq)
            if get_coverage(og) <= threshold:
                all_dict[og_name] = og

    if threshold != 1:
        OG_OUT = mapping_dir + 'og' + str(threshold) + "/"
    elif exclude_species[0] != "none":
        OG_OUT = mapping_dir + 'og_without' + "_".join(exclude_species) + "/"
    else:
        OG_OUT = mapping_dir + 'og/'

    if not os.path.exists(OG_OUT):
        os.makedirs(OG_OUT)

    for key, item in all_dict.items():
        file_name = OG_OUT + key + ".fa"
        with open(file_name, "w") as handle:
            FastaIO.FastaWriter(handle, wrap=None).write_file(item)

    print("FINISHED OG RECONSTRUCTION!")
    return all_dict
Example #12
def get_seq_from_ids(inputfile=None, outputfile=None, id_file=None):

    id_list = id_file.readlines()
    id_list = [x.replace(">", "").rstrip("\n") for x in id_list]

    record_dict = SeqIO.to_dict(SeqIO.parse(inputfile.name, "fasta"))

    fasta_out = FastaIO.FastaWriter(outputfile, wrap=None)

    for i in id_list:
        fasta_out.write_record(record_dict[i])
Example #13
File: hashing.py  Project: Vikash84/MiCoNE
def hash_rep_seqs(unhashed_rep_seqs, output_file):
    seqs = list(SeqIO.parse(unhashed_rep_seqs, "fasta"))
    seq_ids = []
    for seq in seqs:
        seq.id = hash_function(str(seq.seq))
        seq_ids.append(seq.id)
        seq.description = ""
        seq.name = ""
    with open(output_file, "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
    return seq_ids
Example #14
def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            record.id = sanitize_id_for_sam_rname(record.id)
            fasta_out.write_record(record)
    print("out_filepath",out_filepath)
    print("os.path.dirname(out_filepath)",os.path.dirname(out_filepath))
    print("ls -lah")
    for line in subprocess.check_output(["ls", "-lah", os.path.dirname(out_filepath)]).decode("utf-8").split("\n"):
        print(line)
    return out_filepath
Example #15
def MakeConcensusAlignment(pair):
    write_tmpfasta = "tmp.fasta"
    with open(write_tmpfasta, "w") as handle:
        writer = FastaIO.FastaWriter(handle, wrap=None)
        writer.write_file(pair)

    while not os.path.exists(write_tmpfasta):
        time.sleep(1)
    command = "mafft " + write_tmpfasta + " > tmp_2.fasta"
    print(command)
    subprocess.call(command, shell=True)

    return ()
Example #16
def write_records(records, output_file):
    """ Writes FASTA records (BioPython SeqRecord) to a file.

        Parameters
        ----------
        records : list
            List with BioPython SeqRecord objects.
        output_file : str
            Path to the output file.
    """

    with open(output_file, 'w') as output_handle:
        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
        fasta_out.write_file(records)
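A short usage sketch for write_records(), assuming Biopython is available; the record values are invented:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

toy_records = [
    SeqRecord(Seq("ATGAAATGA"), id="locus1", description="toy record"),
    SeqRecord(Seq("ATGCCCTGA"), id="locus2", description=""),
]
write_records(toy_records, "toy_records.fasta")  # one unwrapped sequence per record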
Example #17
def convert_multiline_to_single_line_FASTA():
    global read_len_dict
    sequences = []
    with open("preads4falcon.fasta", "r") as input_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            sequences.append(record)
            read_len_dict[record.id] = len(record.seq)
            record_complement = record.id + "'"
            read_len_dict[record_complement] = len(record.seq)
    with open("formatted_preads4falcon.fasta", "w") as output_handle:
        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
        fasta_out.write_file(sequences)
Example #18
def single_line_records(fasta_in):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(out_basedir,
                                in_fasta_basename + "_single_lines.fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            fasta_out.write_record(record)
Example #19
def main(args):
    COUNT_SEPARATOR = '_x'
    seqs = {}

    for seq_record in SeqIO.parse(args.input_fasta, "fasta"):
        split_seqid = seq_record.id.split(COUNT_SEPARATOR)
        if len(split_seqid) == 1:
            seqs[split_seqid[0]] = {'seq': seq_record.seq, 'count': 1}
        elif len(split_seqid) == 2:
            seqs[split_seqid[0]] = {'seq': seq_record.seq,
                                    'count': int(split_seqid[1])}
        else:
            logging.error("Error parsing: %s", seq_record.id)

    # combinations('ABCD', 2) gives:
    # AB AC AD BC BD CD
    # ie. we don't need to compare AB and BA
    for seq1, seq2 in combinations(seqs, 2):
        # Need to skip over sequences that have been removed on previous iterations
        if seq1 not in seqs or seq2 not in seqs:
            continue

        # Translate each pair of seqs; skip the pair if either fails to translate
        try:
            seq1_translated = seqs[seq1]['seq'].translate()
        except TranslationError:
            print("Error translating: " + seq1, file=sys.stderr)
            continue
        try:
            seq2_translated = seqs[seq2]['seq'].translate()
        except TranslationError:
            print("Error translating: " + seq2, file=sys.stderr)
            continue

        # Remove seq2 from the collection of seqs if it translates to the
        # same amino acid sequence as seq1. Add the counts for seq2 to seq1
        # before removing.
        if seq1_translated == seq2_translated:
            print(seq1, "translates identically to", seq2, "- deleting", seq2)
            seqs[seq1]['count'] += seqs[seq2]['count']
            del seqs[seq2]

    with open('output.fa', 'w') as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_file(
            SeqRecord(seqs[seq]['seq'],
                      id=seq + COUNT_SEPARATOR + str(seqs[seq]['count']),
                      description="")
            for seq in seqs)
Example #20
def split_records(fasta_in):
    for record in SeqIO.parse(fasta_in, "fasta"):
        in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
        out_basedir = os.path.realpath(
            os.path.join(os.path.dirname(fasta_in), in_fasta_basename))
        if not os.path.isdir(out_basedir):
            os.makedirs(out_basedir, exist_ok=True)
        out_filepath = os.path.join(out_basedir, record.id + ".fasta")
        print("%s %i -> %s" % (record.id, len(record), out_filepath))
        if not os.path.exists(out_filepath):
            with open(out_filepath, "w") as handle:
                fasta_out = FastaIO.FastaWriter(handle, wrap=None)
                fasta_out.write_header()
                fasta_out.write_record(record)
                #SeqIO.write(record, handle, "fasta")
        else:
            #raise IOError("%s already exists; skipping..." % out_filepath)
            print("%s already exists; skipping..." % out_filepath)
Example #21
def main():

    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "hr:d:o:",
            ["help", "mapped_reads=", "ref_data=", "out_folder="])
    except getopt.GetoptError as e:
        print(str(e))
        print(
            'concat_alignments.py -r <mapped_reads> -d <ref_data> -o <out_folder>'
        )
        sys.exit(2)

    mapped_reads = None
    ref_data = None
    out_folder = None

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(
                'concat_alignments.py -r <mapped_reads> -d <ref_data> -o <out_folder>'
            )
            sys.exit()
        elif opt in ("-r", "--reads"):
            mapped_reads = arg
        elif opt in ("-d", "--ref_data"):
            ref_data = arg
        elif opt in ("-o", "--out_folder"):
            out_folder = arg
        else:
            assert False, "unhandled option"

    read_mappings = list(SeqIO.parse(mapped_reads, "fasta"))
    og_data = list(SeqIO.parse(ref_data, "fasta"))

    if out_folder[-1] != "/":
        out_folder += "/"

    list_of_ogs = get_ogs(read_mappings, og_data)
    if list_of_ogs is not None:
        for og in list_of_ogs:
            file_name = out_folder + og + ".fasta"
            with open(file_name, "w") as handle:
                FastaIO.FastaWriter(handle, wrap=None).write_file(list_of_ogs[og])
Example #22
def subset_to_ids_not_in_file(fasta_in, ids_file, fasta_out):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    #out_filepath = os.path.join(out_basedir,in_fasta_basename+"_subset.fasta")
    out_filepath = fasta_out

    ids_to_include = set()

    with open(ids_file) as ids_file:
        for line in ids_file:
            ids_to_include.add(line.strip())

    #if os.path.exists(out_filepath):
    #    raise IOError("%s already exists; skipping..." % out_filepath)

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            if record.id not in ids_to_include:
                fasta_out.write_record(record)
Example #23
def write_and_drop_seqs(fasta_in,
                        fasta_out,
                        gap_threshold=None,
                        ambig_threshold=None):
    print("ambig_threshold", ambig_threshold)
    print("gap_threshold", gap_threshold)

    with open(fasta_out, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=80)  # wrap=None
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in.name, "fasta"):
            ambig_fraction = record.seq.count("N") / float(len(record))
            gap_fraction = record.seq.count("-") / float(len(record))

            if (ambig_threshold is not None and ambig_fraction > ambig_threshold
                ) or (gap_threshold is not None and gap_fraction > gap_threshold):
                print("omitting", record.id, "ambig:", ambig_fraction, "gap:",
                      gap_fraction)
                continue
            else:
                #print("writing",record.id,"ambig:",ambig_fraction,"gap:",gap_fraction,"gap_chrs:",record.seq.count("-"))
                fasta_out.write_record(record)
Example #24
def testSerialize(self):
    insert_gen = InsertGenerator(self.INSERT_SEQ, self.FIXED_5P, self.FIXED_3P,
                                 extra_bp_5p='T', linker_gen=self.LINKER_GEN)

    records = []

    for tn_id in range(10):
        tn_gen = Transposition(tn_id, insert_gen, self.TARGET, self.ORF_START)
        records.extend([tn_gen.Shear(i).ToSeqRecord() for i in range(100)])

    outfile = StringIO()
    writer = FastaIO.FastaWriter(outfile)
    writer.write_header()
    writer.write_records(records)

    # Parse the generated output.
    infile = StringIO(outfile.getvalue())
    parsed = SeqIO.parse(infile, 'fasta')
    expected_info_keys = Fragment.INFO_DICT_KEYS
    for record in parsed:
        info_dict = Fragment.ParseInfoDict(record)
        self.assertListEqual(sorted(info_dict.keys()),
                             sorted(expected_info_keys))
Example #25
def perform_mapping(DIR_MAPPING, FILE_OGS):
    og_dict = {}
    # read in the OGs with their amino-acid sequences
    og = list(SeqIO.parse(FILE_OGS, "fasta"))
    for record in og:
        key = record.description.split(" | ")[-1]
        if key in og_dict:
            ids = [rec.id for rec in og_dict[key]]
            if record.id not in ids:
                og_dict[key].append(record)
        else:
            og_dict[key] = []
            og_dict[key].append(record)


    # parse the mapped reads to ogs to dictionary
    all_dict = {}
    for file in glob.glob(DIR_MAPPING + "*.fa"):
        og_name = file.split("_")[-1].split(".")[0]
        og = og_dict[og_name]

        # change ids to species names
        for i, record in enumerate(og):
            s = record.id[0:5]
            record.id = s
        all_dict[og_name] = og

    OG_OUT = DIR_MAPPING + 'origin_og/'
    if not os.path.exists(OG_OUT):
        os.makedirs(OG_OUT)

    for key, item in all_dict.items():
        file_name = OG_OUT + key + ".fa"
        with open(file_name, "w") as handle:
            FastaIO.FastaWriter(handle, wrap=None).write_file(item)

    print("FINISHED PARSING OGs!")
    return all_dict
Example #26
def regenerate():
  from Bio import SeqIO
  from Bio.SeqIO import FastaIO
  import argparse

  parser = argparse.ArgumentParser(description='''
  Change the IDs of all fasta entries (contigs) to prefixed, sanitized IDs.
  The output format of each new ID is: PREFIX _ 'C' _ OLD-ID
  ''')
  parser.add_argument("--prefix", dest='dataset_id', required=True, help='ID prefix')
  parser.add_argument("--input", dest='input_fasta', required=True, help='fasta input file')
  parser.add_argument("--output", dest='output_fasta', required=True, help='fasta output file')
  args = parser.parse_args()

  with open(args.input_fasta, 'r') as sourceFile:
    with open(args.output_fasta, 'w') as destFile:
      dest = FastaIO.FastaWriter(destFile, wrap=None)
      dest.write_header()
      for record in SeqIO.parse(sourceFile, "fasta"):
        new_id = args.dataset_id + '_C_' + extract_unique_element_from_contigid(record.id)
        record.id = new_id
        record.description = ""  # any comments after contig id are removed
        dest.write_record(record)
Example #27
def remap_tax_id(fasta_in, remap_table):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(out_basedir,
                                in_fasta_basename + "_remapped_taxids.fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    id_map = dict()
    with open(remap_table, "r") as map_handle:
        for line in map_handle:
            old, new = line.rstrip("\n").split("\t")
            if old in id_map:
                raise LookupError("%s already found in map" % old)
            id_map[old] = new

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            record.id = id_map[record.id]
            fasta_out.write_record(record)
Example #28
#!/usr/bin/env python

# (c) Christian Henke
# Licensed under Apache License 2.0
# https://www.apache.org/licenses/LICENSE-2.0

from Bio import SeqIO
from Bio.SeqIO import FastaIO
import argparse

parser = argparse.ArgumentParser(description='''
Remove sequences from fasta that are shorter than the desired minimum length.
''')
parser.add_argument("--input", dest='input_fasta', required=True, help='fasta input file')
parser.add_argument("--output", dest='output_fasta', required=True, help='fasta output file')
parser.add_argument("--min-seq-length", dest='min_seq_len', type=int, required=True, help='minimum sequence length (integer)')
args = parser.parse_args()

with open(args.input_fasta, 'r') as sourceFile:
  with open(args.output_fasta, 'w') as destFile:
    dest = FastaIO.FastaWriter(destFile, wrap=None)
    dest.write_header()
    for record in SeqIO.parse(sourceFile, "fasta"):
      if len(record.seq) >= args.min_seq_len:
        dest.write_record(record)
Example #29
names = []
for seq in proteins:
    names.append(seq.id)

orfs = []
for i in names:
    for j in nucleotides:
        if i == j.id:
            orfs.append(j)

#for seq in orfs:
#	seq.description=""

handle = open(output + '_QMS_DB.fasta', "w")
writer = FastaIO.FastaWriter(handle, wrap=None)
writer.write_file(orfs)
handle.close()
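The two nested loops above do an O(n*m) id match between proteins and nucleotides. A dict keyed by record id gives the same orfs list in linear time; a sketch under the same assumptions (both variables are lists of SeqRecord objects with unique ids):

# Index nucleotide records by id, then look up each protein id directly.
nucleotides_by_id = {rec.id: rec for rec in nucleotides}
orfs = [nucleotides_by_id[seq.id] for seq in proteins if seq.id in nucleotides_by_id]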

###############################################

print(
    "\n::::::: BLASTing ORFs and appending hit to description :::::::\n\n v Ignore this warning v \n"
)

orfs_name = output + "_QMS_DB.fasta"
blast_xml = output + "_blast.xml"

command = blast_path + blast_type + ' -query ' + orfs_name + ' -db ' + blast_database + ' -outfmt 5 -num_threads ' + str(
    num_threads) + ' -max_target_seqs 1 -evalue 0.0001 -out ' + blast_xml
subprocess.call(command, shell=True)
Example #30
def transform_file(source_file, destination_file, arguments):
    # Determine the input file type, either from the arguments or by
    # sniffing the file handle.
    source_file_type = (arguments.input_format or from_handle(source_file))

    destination_file_type = (arguments.output_format or
            from_handle(destination_file))

    # Get an iterator.
    sorters = {'length': transform.sort_length,
               'name': transform.sort_name,}
    directions = {'asc': 1, 'desc': 0}
    if arguments.sort:
        # Sorted iterator
        key, direction = arguments.sort.split('-')
        records = sorters[key](source_file=source_file,
                source_file_type=source_file_type,
                direction=directions[direction])
    else:
        # Unsorted iterator.
        records = SeqIO.parse(source_file, source_file_type,
                alphabet=ALPHABETS.get(arguments.alphabet))


    #########################################
    # Apply generator functions to iterator.#
    #########################################

    # Apply all the transform functions in transforms
    if arguments.transforms:

        # Special case handling for --cut and --relative-to
        if arguments.cut_relative:
            for o, n in ((transform.multi_cut_sequences,
                          transform.cut_sequences_relative),
                         (transform.multi_mask_sequences,
                          transform.mask_sequences_relative)):
                # Add a function to trim any columns which are gaps in the
                # sequence ID
                try:
                    f = next(f for f in arguments.transforms
                             if f.func == o)
                except StopIteration:
                    continue
                i = arguments.transforms.index(f)
                arguments.transforms.pop(i)
                arguments.transforms.insert(i,
                        functools.partial(n,
                            record_id=arguments.cut_relative, **f.keywords))

        for function in arguments.transforms:
            records = function(records)

    if (arguments.deduplicate_sequences or
            arguments.deduplicate_sequences is None):
        records = transform.deduplicate_sequences(
            records, arguments.deduplicate_sequences)

    # Apply all the partial functions
    if arguments.apply_function:
        for apply_function in arguments.apply_function:
            records = apply_function(records)

    # Only the fasta format is supported, as SeqIO.write does not have a 'wrap'
    # parameter.
    if (arguments.line_wrap is not None and destination_file_type == 'fasta'):
        logging.info("Attempting to write fasta with %d line breaks.",
                arguments.line_wrap)

        with destination_file:
            writer = FastaIO.FastaWriter(
                destination_file, wrap=arguments.line_wrap)
            writer.write_file(records)
    else:
        # Mogrify requires writing all changes to a temporary file by default,
        # but convert uses a destination file instead if one was specified. Get
        # sequences from an iterator that has generator functions wrapping it.
        # After creation, it is then copied back over the original file if all
        # tasks finish up without an exception being thrown.  This avoids
        # loading the entire sequence file up into memory.
        logging.info("Applying transformations, writing to %s",
                destination_file)
        SeqIO.write(records, destination_file, destination_file_type)