Beispiel #1
0
def file_shuffler(infwd, inrev, outfile):
    """Interleave two read files record by record into ``outfile``.

    The file type is detected from ``infwd`` with ``fasta_or_fastq``.  A
    record is taken to be 2 lines for fasta and 4 lines for fastq.  Each
    record copied from ``infwd`` is followed by the same number of lines
    copied from ``inrev``.

    Copying stops as soon as ``infwd`` is exhausted, so any lines still left
    in ``inrev`` at that point are not written.

    Args:
        infwd: name of the file whose record comes first in each pair.
        inrev: name of the file whose record comes second in each pair.
        outfile: name of the interleaved file to write.

    Exits through ``sys.exit`` when the type of ``infwd`` is not recognised.
    """
    filetype = fasta_or_fastq(infwd)
    if filetype == 'unknown':
        sys.exit('File type not recognised: ' + infwd)

    lineskip = {'fasta': 2, 'fastq': 4}[filetype]

    # Track what has been opened so everything is closed even on failure.
    opened = []
    try:
        f_infwd = mh12_utils.open_file_read(infwd)
        opened.append(f_infwd)
        f_inrev = mh12_utils.open_file_read(inrev)
        opened.append(f_inrev)
        f_out = mh12_utils.open_file_write(outfile)
        opened.append(f_out)

        line_fwd = f_infwd.readline()
        line_rev = f_inrev.readline()

        while line_fwd:
            for _ in range(lineskip):
                f_out.write(line_fwd)
                line_fwd = f_infwd.readline()

            for _ in range(lineskip):
                f_out.write(line_rev)
                line_rev = f_inrev.readline()
    finally:
        for handle in opened:
            handle.close()
Beispiel #2
0
def fasta2singleLine(infile, outfile):
    """Rewrite a fasta (or fasta-style quality) file with one line per record.

    Every header line (starting with ``>``) is copied unchanged and all the
    lines that follow it, up to the next header, are joined onto one line.
    Lines that start with a newline character are dropped.

    The input is treated as a quality file when its second line starts with
    a digit; in that case a single space is written after each joined line
    so that the values of consecutive lines stay separated.

    Args:
        infile: name of the file to read.
        outfile: name of the file to write.  If it equals ``infile`` the
            result is written to a randomly named temporary file which is
            then renamed over ``infile``.

    Exits through ``sys.exit`` if the temporary file name already exists.
    """
    overwrite = False

    if infile == outfile:
        overwrite = True
        outfile = outfile + '-tmp-' + str(random.randint(0, 1000000))
        if infile.endswith('.gz'):
            outfile = outfile + '.gz'

        if os.path.exists(outfile):
            sys.exit('Yowzer! Unlikely to happen, but ' + outfile +
                     ' already exists.  Aborting')

    # Raw string: '\d' in a plain string literal is an invalid escape.
    starts_with_digit = re.compile(r'^\d')

    # Peek at the line after the first header to decide whether this is a
    # fasta file or a fasta.qual file.
    f_in = mh12_utils.open_file_read(infile)
    try:
        f_in.readline()
        fasta = not starts_with_digit.match(f_in.readline())
    finally:
        f_in.close()

    f_in = mh12_utils.open_file_read(infile)
    try:
        f_out = mh12_utils.open_file_write(outfile)
        try:
            first = True

            for line in f_in:
                if line.startswith("\n"):
                    continue
                elif line.startswith(">"):
                    # Terminate the previous record's joined line.
                    if not first:
                        f_out.write("\n")
                    else:
                        first = False

                    f_out.write(line)
                else:
                    f_out.write(line.rstrip())

                    if not fasta:
                        f_out.write(' ')

            f_out.write("\n")
        finally:
            f_out.close()
    finally:
        f_in.close()

    if overwrite:
        os.rename(outfile, infile)
Beispiel #3
0
def fastn2subset(infile, ids, outfile, complement=False, start_only=False):
    """Copy the sequences of ``infile`` selected by ``ids`` into ``outfile``.

    Args:
        infile: name of a fasta/fastq file (type detected with
            ``fasta_or_fastq``).
        ids: container tested with ``in`` against each sequence id.
        outfile: name of the file to write.
        complement: when true, write the sequences whose id is NOT in
            ``ids`` instead of those whose id is.
        start_only: ``False`` (default) leaves ids untouched.  An empty
            string keeps only the first whitespace-delimited word of each
            id.  Any other value is passed to ``str.split`` as the separator
            and the part before its first occurrence is kept.  The id is
            shortened on the sequence object itself, before matching.

    Exits through ``sys.exit`` when the file type is not recognised.
    """
    filetype = fasta_or_fastq(infile)

    if filetype == 'unknown':
        sys.exit('File ' + infile + ' not recognised as a fasta/q')

    f_in = mh12_utils.open_file_read(infile)
    try:
        f_out = mh12_utils.open_file_write(outfile)
        try:
            while True:
                seq = get_next_seq_from_file(f_in, filetype)

                if not seq:
                    break

                if start_only == '':
                    seq.id = seq.id.split()[0]
                # Equality (not identity) on purpose: keeps the original
                # behaviour for values such as 0 that compare equal to False.
                elif start_only != False:  # noqa: E712
                    seq.id = seq.id.split(start_only)[0]

                # Write when membership and ``complement`` disagree.
                if (seq.id in ids) != bool(complement):
                    # write() instead of the Python-2-only ``print >> f``.
                    f_out.write(str(seq) + '\n')
        finally:
            f_out.close()
    finally:
        f_in.close()
Beispiel #4
0
def fastn_splitter(fname, outprefix):
    """Write every sequence of a fasta/fastq file to its own file.

    The output name for a sequence is ``outprefix``, followed by the
    sequence id with spaces replaced by underscores and its first character
    dropped, followed by ``'.'`` and the detected file type.

    Args:
        fname: name of the fasta/fastq file to split.
        outprefix: string prepended to every output file name.

    Returns:
        List of the output file names, in the order they were written.

    Exits through ``sys.exit`` when the file type is not recognised.
    """
    file_list = []
    filetype = fasta_or_fastq(fname)
    if filetype == 'unknown':
        sys.exit('Unknown file format of ' + fname +
                 ' in method fastn.fastn_splitter')

    f_in = mh12_utils.open_file_read(fname)
    try:
        # loop through file, writing each sequence to new file
        while True:
            seq = get_next_seq_from_file(f_in, filetype)

            if not seq:
                break

            # NOTE(review): [1:] drops the first character of the id --
            # presumably a leading marker character; confirm against the
            # sequence class.
            outname = (outprefix + seq.id.replace(' ', '_')[1:] + '.' +
                       filetype)
            f_out = mh12_utils.open_file_write(outname)
            try:
                # write() instead of the Python-2-only ``print >> f``.
                f_out.write(str(seq) + '\n')
            finally:
                f_out.close()
            file_list.append(outname)
    finally:
        f_in.close()

    return file_list
Beispiel #5
0
def fastn2lengthdic(fname, d, min_length=1, ignoreN=False, first_only=False):
    """Record the length of each sequence of a fasta/fastq file in ``d``.

    Args:
        fname: name of the fasta/fastq file to read.
        d: dictionary updated in place, mapping sequence id -> length.
        min_length: sequences shorter than this are skipped.  The test is
            made before any N's are removed.
        ignoreN: when true, 'N' and 'n' characters are removed from the
            sequence before its length is stored.
        first_only: when true, only the first whitespace-delimited word of
            the id is used as the key.

    Exits through ``sys.exit`` when the file type is not recognised.
    """
    filetype = fasta_or_fastq(fname)
    if filetype == "unknown":
        sys.exit("Unknown file format of " + fname +
                 " in method mh12_utils.fastn2lengthdic")

    handle = mh12_utils.open_file_read(fname)

    while True:
        record = get_next_seq_from_file(handle, filetype)
        if not record:
            break

        if len(record) >= min_length:
            if ignoreN:
                record.seq = record.seq.replace('N', '').replace('n', '')

            if first_only:
                record.id = record.id.split()[0]

            d[record.id] = len(record)

    handle.close()
Beispiel #6
0
def fai2length_hash(filename, d):
    """Fill ``d`` from a whitespace-delimited file of names and lengths.

    For every line, the first field becomes the key and the second field,
    converted to ``int``, becomes the value.

    Args:
        filename: name of the file to read.
        d: dictionary updated in place.
    """
    handle = mh12_utils.open_file_read(filename)

    for record in handle:
        fields = record.split()
        name, length = fields[0], fields[1]
        d[name] = int(length)

    handle.close()
Beispiel #7
0
def fasta_or_fastq(filename):
    """Guess the type of a sequence file from the start of its first line.

    Returns:
        ``'fasta'`` if the first line starts with ``>``, ``'fastq'`` if it
        starts with ``@``, and ``'unknown'`` otherwise.
    """
    handle = mh12_utils.open_file_read(filename)
    first_line = handle.readline()
    handle.close()

    for marker, kind in (('>', 'fasta'), ('@', 'fastq')):
        if first_line.startswith(marker):
            return kind

    return 'unknown'
Beispiel #8
0
def fastn2dictionary(fname, d, first_only=False):
    """Load every sequence of a fasta/fastq file into ``d``, keyed by id.

    Args:
        fname: name of the fasta/fastq file to read.
        d: dictionary updated in place, mapping sequence id -> sequence
            object.
        first_only: when true, each id is first cut down to its first
            whitespace-delimited word (the sequence object's id is changed
            too).
    """
    filetype = fasta_or_fastq(fname)
    handle = mh12_utils.open_file_read(fname)

    record = get_next_seq_from_file(handle, filetype)
    while record:
        if first_only:
            record.id = record.id.split()[0]

        d[record.id] = record
        record = get_next_seq_from_file(handle, filetype)

    handle.close()
Beispiel #9
0
def fasta2multiline(infile, outfile, line_length):
    """Rewrite a fasta file using each sequence's ``multi_line_str`` form.

    Args:
        infile: name of the fasta file to read.
        outfile: name of the file to write; must differ from ``infile``.
        line_length: value passed to ``multi_line_str`` for every sequence.

    Exits through ``sys.exit`` when ``infile`` and ``outfile`` are equal.
    """
    if infile == outfile:
        sys.exit('infile = outfile in fastn.fasta2multiline. Aborting')

    f_in = mh12_utils.open_file_read(infile)
    try:
        f_out = mh12_utils.open_file_write(outfile)
        try:
            while True:
                seq = get_next_seq_from_file(f_in, 'fasta')

                if not seq:
                    break

                # write() instead of the Python-2-only ``print >> f``.
                f_out.write(str(seq.multi_line_str(line_length)) + '\n')
        finally:
            f_out.close()
    finally:
        f_in.close()
Beispiel #10
0
def fasta_singleline_ok(filename):
    """Check that a fasta file has exactly one sequence line per header.

    Every header line (starting with ``>``) must sit on an odd line number,
    i.e. header number k is line 2k - 1.  In addition, the record after the
    last header may contain at most one non-blank line, so a final sequence
    spread over several lines is rejected as well.

    Args:
        filename: name of the fasta file to check.

    Returns:
        True if the layout is as described above (a file without any header
        line is also accepted), False otherwise.
    """
    line_count = 0
    seq_count = 0
    # Non-blank lines seen since the most recent header line.
    lines_in_last_record = 0

    f = mh12_utils.open_file_read(filename)
    try:
        for line in f:
            line_count += 1

            if line.startswith('>'):
                seq_count += 1
                lines_in_last_record = 0

                # Each earlier record must have used exactly two lines.
                if 2 * (seq_count - 1) != line_count - 1:
                    return False
            elif line.strip():
                lines_in_last_record += 1
    finally:
        # Also runs on the early return above, so the file never leaks.
        f.close()

    # The header check never looks past the last header, so validate the
    # final record here.
    return seq_count == 0 or lines_in_last_record <= 1
Beispiel #11
0
def fastn2uniq(infile, outfile):
    """Copy a fasta/fastq file, keeping only the first sequence per id.

    Sequences are written in input order; a sequence whose id has already
    been seen is skipped.

    Args:
        infile: name of the fasta/fastq file to read.
        outfile: name of the file to write.
    """
    filetype = fasta_or_fastq(infile)
    reads = set()

    f_in = mh12_utils.open_file_read(infile)
    try:
        f_out = mh12_utils.open_file_write(outfile)
        try:
            while True:
                seq = get_next_seq_from_file(f_in, filetype)

                if not seq:
                    break

                if seq.id not in reads:
                    reads.add(seq.id)
                    # write() instead of the Python-2-only ``print >> f``.
                    f_out.write(str(seq) + '\n')
        finally:
            f_out.close()
    finally:
        f_in.close()
Beispiel #12
0
def fastn_split(fname, outprefix, no_of_bases):
    """Split a fasta/fastq file into numbered files by total sequence length.

    Sequences are collected in input order.  When adding the next sequence
    would bring the running total to ``no_of_bases`` or more, the collected
    sequences are written out and a new batch is started with that sequence.
    Sequences are never split, so a file holding a single sequence may reach
    or exceed ``no_of_bases``.  Output files are named
    ``outprefix + <number> + '.' + <filetype>``, numbered from 1.

    Args:
        fname: name of the fasta/fastq file to split.
        outprefix: string prepended to every output file name.
        no_of_bases: threshold on the summed sequence length per file.

    Returns:
        The number of files written (0 if the input has no sequences).

    Exits through ``sys.exit`` when the file type is not recognised.
    """
    # writes array of sequences to a file
    def write_file(fname, a):
        f = mh12_utils.open_file_write(fname)
        try:
            for seq in a:
                # write() instead of the Python-2-only ``print >> f``.
                f.write(str(seq) + '\n')
        finally:
            f.close()

    filetype = fasta_or_fastq(fname)
    if filetype == 'unknown':
        sys.exit('Unknown file format of ' + fname +
                 ' in function fastn.fastn_split')

    f_in = mh12_utils.open_file_read(fname)
    try:
        first_seq = get_next_seq_from_file(f_in, filetype)

        # Same end-of-input test as in the loop below; without it an input
        # with no sequences fails on len() of the missing first sequence.
        if not first_seq:
            return 0

        file_counter = 1
        seqs = [first_seq]
        base_counter = len(first_seq)

        # loop through file, writing out files when enough data gathered
        while True:
            next_seq = get_next_seq_from_file(f_in, filetype)

            if not next_seq:
                write_file(outprefix + str(file_counter) + '.' + filetype,
                           seqs)
                break

            # either write out sequences, or add the next one to array
            if base_counter + len(next_seq) >= no_of_bases:
                write_file(outprefix + str(file_counter) + '.' + filetype,
                           seqs)
                seqs = [next_seq]
                base_counter = next_seq.length()
                file_counter += 1
            else:
                seqs.append(next_seq)
                base_counter += next_seq.length()
    finally:
        f_in.close()

    return file_counter
Beispiel #13
0
# Exactly three positional arguments are required: the input file, the text
# output file and the plot output file.
if len(parser.rargs) != 3:
    parser.print_help()
    sys.exit(1)

options.infile = parser.rargs[0]
options.txtout = parser.rargs[1]
options.plotout = parser.rargs[2]

filetype = fastn.fasta_or_fastq(options.infile)

if filetype == 'unknown':
    # Use options.infile: no bare ``infile`` name is assigned in this script
    # section, so the original message would fail with a NameError.
    sys.exit('File ' + options.infile + ' not recognised as a fasta/q')

# One counter for each integer key 0..100; incremented by floor(gc) in the
# loop below.
gc_hist = dict(zip(range(101), [0] * 101))

f_in = mh12_utils.open_file_read(options.infile)
f_out = mh12_utils.open_file_write(options.txtout)

while 1:
    seq = fastn.get_next_seq_from_file(f_in, filetype)

    if not seq:
        break

    if options.window:
        i = 0

        while i < len(seq):
            tmp = fastn.Fasta(seq.id, seq.seq[i:i + options.window])
            gc = tmp.gc()
            gc_hist[floor(gc)] += 1