Example 1
def sequence_from_bedfile(fastafile,
                          features=None,
                          bedfile=None,
                          pad5=0,
                          pad3=0):
    """Fasta sequences from set of genomic features in a bed file
        Args:
            fastafile: fasta file with genomic sequence
            features: dataframe of features/coords with bed file col names
            bedfile: optionally provide a bed file instead
            pad5,pad3: flanking sequence at 5' or 3' ends
        Returns:
            a pandas dataframe with name, sequence and coord columns"""

    from pybedtools import BedTool
    if bedfile is not None:
        features = utils.bed_to_dataframe(bedfile)
    new = []
    for n, r in features.iterrows():
        if r.strand == '+':
            coords = (r.chr, r.chromStart - pad5, r.chromEnd + pad3)
            seq = str(BedTool.seq(coords, fastafile))
        else:  #reverse strand
            coords = (r.chr, r.chromStart - pad3, r.chromEnd + pad5)
            seq = str(BedTool.seq(coords, fastafile))
            seq = HTSeq.Sequence(seq).get_reverse_complement()
        new.append([r['name'], str(seq), coords])
    new = pd.DataFrame(new, columns=['name', 'seq', 'coords'])
    return new
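A hypothetical call, assuming the genome fasta, the bed file and the utils.bed_to_dataframe helper from the same module are available (file names are made up):

result = sequence_from_bedfile('genome.fa', bedfile='targets.bed', pad5=10, pad3=10)
print(result[['name', 'seq']].head())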
Example 2
def test_pickle():
    import pickle

    print('Test pickling and unpickling')
    pickles = [
        {
            'name': 'HTSeq.Sequence',
            'object': HTSeq.Sequence(b'ACTG', 'sequence'),
            'assert_properties': ('seq', 'name', 'descr')
        },
    ]

    for pic in pickles:
        print('Pickling ' + pic['name'])
        pickled = pickle.dumps(pic['object'])
        print('Done')

        print('Unpickling ' + pic['name'])
        unpick = pickle.loads(pickled)
        print('Done')

        if 'assert_properties' in pic:
            print('Checking serialized/deserialized')
            for prop in pic['assert_properties']:
                assert getattr(pic['object'], prop) == getattr(unpick, prop)
            print('Done')
    print("Test passed")
Example 3
def get_consensus(sites, seq_type, m=-1):
    """return a string as consesus sequence """

    bases = ['A', 'C', 'G', 'T', 'N']
    n = len(sites)  # number of sites
    if m == -1 and sites: m = len(sites[0][seq_type])

    # initialize count array
    # rows correspond to positions in motif sequence
    # columns correspond to bases: "A", "C", "G", "T", and "N"
    counts = np.zeros((m, 5), int)
    consensus = ""
    for s in sites:

        # convert seq to HTSeq.Sequence object
        seq = HTSeq.Sequence(str(s[seq_type]))
        # count bases to counts array
        seq.add_bases_to_count_array(counts)

    base_idx = np.argmax(counts, 1)

    for i in range(m):

        # upper-case base if more than 75% of sites agree
        if n > 0 and counts[i, base_idx[i]] / float(n) > 0.75:
            consensus += bases[base_idx[i]]

        # lower-case base if more than 50% of sites agree
        elif n > 0 and counts[i, base_idx[i]] / float(n) > 0.5:
            consensus += bases[base_idx[i]].lower()
        else:
            consensus += "."

    return consensus
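A toy call as a sketch; the 'motif_seq' key is arbitrary, and it assumes an HTSeq build that accepts str sequences (as this snippet does):

sites = [{'motif_seq': 'ACGTA'}, {'motif_seq': 'ACGTT'},
         {'motif_seq': 'ACCTA'}, {'motif_seq': 'ACGTA'}]
print(get_consensus(sites, 'motif_seq'))  # upper case where >75% of sites agree, lower case where >50%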
Example 4
def filter_fasta(infile):

    fastafile = HTSeq.FastaReader(infile)
    sequences = [(s.name, s.seq, s.descr) for s in fastafile]
    out = open('filtered.fa', "w")
    for s in sequences:
        # skip placeholder records that carry no sequence
        if s[1] == 'Sequence unavailable':
            continue
        myseq = HTSeq.Sequence(s[1], s[0])
        myseq.write_to_fasta_file(out)
    out.close()
    return
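A minimal invocation sketch; the input path is hypothetical, and 'Sequence unavailable' is the placeholder text the function skips:

filter_fasta('biomart_export.fa')  # writes filtered.fa in the working directory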
Example 5
def readdna(filename):
    """
    Reads in the dna sequence of the given fasta

    @type  filename: string
    @param filename: Fasta-file used as input.
    @rtype:   HTSeq Sequence object
    @return:  Reference Fasta.
    """
    fasta = HTSeq.FastaReader(filename)
    # only the last record is kept, so this is intended for single-sequence reference fastas
    for record in fasta:
        referenz = HTSeq.Sequence(record.seq, record.name)
    return referenz
Example 6
def demultiplex(infile, outfile, sequences, seq2regex):
    fastq_file = HTSeq.FastqReader(infile)
    with open(outfile, "w+") as outf:
        for read in fastq_file:
            for sequence in sequences:
                # only sequence names containing 'r' are treated as barcodes here
                if 'r' not in sequence:
                    continue
                match = re.search(seq2regex[sequence], read.seq)
                if not match:
                    continue
                # trim the matched barcode and its qualities off the 5' end of the read
                barcode = HTSeq.Sequence(match.group(0))
                read2 = read.trim_left_end_with_quals(barcode)
                read2.write_to_fastq_file(outf)
                break
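The sequences and seq2regex arguments are not shown above; a hypothetical call could look like this, with made-up barcode names and bytes patterns (recent HTSeq returns read.seq as bytes):

seq2regex = {'r1_TTAGGC': rb'^TTAGGC', 'r2_ACTTGA': rb'^ACTTGA'}  # hypothetical barcode regexes
demultiplex('lane1.fastq', 'lane1.trimmed.fastq', list(seq2regex), seq2regex)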
Example 7
def sequence_from_coords(fastafile, coords):
    """Fasta sequence from genome coords"""

    from pybedtools import BedTool
    chrom, start, end, strand = coords
    if not os.path.exists(fastafile):
        print('no such file')
        return
    try:
        # pybedtools BedTool.seq takes (chrom, start, end); the strand is handled below
        if strand == '+':
            seq = str(BedTool.seq((chrom, start, end), fastafile))
        else:  # reverse strand: reverse complement the extracted sequence
            seq = str(BedTool.seq((chrom, start, end), fastafile))
            seq = str(HTSeq.Sequence(seq).get_reverse_complement())
    except Exception as e:
        print(e)
        return
    return seq
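A hypothetical call; the fasta path and coordinates are made up:

seq = sequence_from_coords('genome.fa', ('chr1', 11868, 12227, '-'))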
Example 8
def trim_adapters(infile, adapter, outfile='cut.fastq', method='default'):
    """Trim adapters using cutadapt"""

    if not isinstance(adapter, str):
        print('not a valid adapter')
        return

    if method == 'default':
        newfile = open(outfile, "w")
        fastqfile = HTSeq.FastqReader(infile, "solexa")
        a = HTSeq.Sequence(adapter)
        for s in fastqfile:
            # trim the adapter from the 3' end, allowing no mismatches
            new = s.trim_right_end(a, mismatch_prop=0.)
            new.write_to_fastq_file(newfile)
        newfile.close()
    elif method == 'cutadapt':
        cmd = 'cutadapt -m 18 -O 5 -q 20 -a %s %s -o %s' % (adapter, infile, outfile)
        print(cmd)
        result = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
    return
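A hypothetical call using the cutadapt path; the adapter and file names are made up, and cutadapt must be on the PATH:

trim_adapters('reads.fastq', 'TGGAATTCTCGGGTGCCAAGG', outfile='trimmed.fastq', method='cutadapt')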
Example 9
def dataframe_to_fasta(df, outfile='out.fa', seqkey='seq', idkey='id'):
    """Convert dataframe to fasta"""

    if idkey not in df.columns:
        df = df.reset_index()
    fastafile = open(outfile, "w")
    for i, row in df.iterrows():
        try:
            seq = row[seqkey]
        except KeyError:
            continue
        # write in DNA alphabet (convert any RNA U to T)
        seq = seq.upper().replace('U', 'T')
        if idkey in row:
            d = str(row[idkey])
        else:
            d = row.name
        # HTSeq.Sequence expects the sequence as bytes
        seq = seq.encode()
        myseq = HTSeq.Sequence(seq, str(d))
        myseq.write_to_fasta_file(fastafile)
    fastafile.close()
    return
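A small usage sketch with made-up data; column names follow the defaults above:

import pandas as pd
df = pd.DataFrame({'id': ['mir-1', 'mir-2'],
                   'seq': ['UGGAAUGUAAAGAAGUAUGUAU', 'UAUGGCACUGGUAGAAUUCACU']})
dataframe_to_fasta(df, outfile='mirnas.fa')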
Example 10
def match_read_primers(read1, read2, primer_sets):
    amplicon_assignment = defaultdict(lambda: 0.0)

    for amplicon in primer_sets:
        # leading bases of each read, as long as the corresponding primer
        read1_primer_region = read1.seq[0:len(primer_sets[amplicon]['dlso'])]
        read2_primer_region = read2.seq[0:len(primer_sets[amplicon]['ulso'])]

        dlso = HTSeq.Sequence(primer_sets[amplicon]['dlso'], "DLSO")

        # similarity of read 1 vs the reverse complement of the DLSO primer,
        # and of read 2 vs the ULSO primer
        ratio1 = fuzz.ratio(read1_primer_region, dlso.get_reverse_complement())
        ratio2 = fuzz.ratio(read2_primer_region, primer_sets[amplicon]['ulso'])

        if ratio1 > amplicon_assignment['ratio1']:
            amplicon_assignment['ratio1_amplicon'] = amplicon
            amplicon_assignment['ratio1'] = ratio1
            amplicon_assignment['read1_primer_seq'] = read1_primer_region
        if ratio2 > amplicon_assignment['ratio2']:
            amplicon_assignment['ratio2_amplicon'] = amplicon
            amplicon_assignment['ratio2'] = ratio2
            amplicon_assignment['read2_primer_seq'] = read2_primer_region

    return amplicon_assignment
Example 11
def separate_demultiplex(libraries_dir, libname, libpath, lib2seq, sequences, seq2regex):
    fastq_file = HTSeq.FastqReader(libpath)
    # reads bucketed by barcode number
    seq_storage = {}
    count = 0
    for barcode_num in lib2seq[libname]:
        seq_storage[barcode_num] = []
    for read in fastq_file:
        for sequence in sequences:
            if 'r' not in sequence:
                continue
            match = re.search(seq2regex[sequence], read.seq)
            if not match:
                continue
            count += 1
            barcode = HTSeq.Sequence(match.group(0))
            barcode_num = sequence[:2]
            read2 = read.trim_left_end_with_quals(barcode)
            seq_storage[barcode_num].append(read2)
            break
    for barcode_num in seq_storage.keys():
        with open("{}/{}/{}.filter-RNA.demulti.{}.fastq".format(libraries_dir, libname, libname, barcode_num), "w+") as outf:
            for read in seq_storage[barcode_num]:
                read.write_to_fastq_file(outf)
Example 12
def sequence_from_coords(fastafile, coords):
    """Fasta sequence from genome coords.
    Args:
        fastafile: input fasta file
        coords: genome coordinates as tuple of the form
                (chrom,start,end,strand)
    """

    if not os.path.exists(fastafile):
        print ('no such file')
        return
    chrom,start,end,strand = coords
    from pyfaidx import Fasta
    genes = Fasta(fastafile)
    try:
        #gets seq string from genome
        seq = str(genes[chrom][start:end])
        if strand == '-':
            seq = str(HTSeq.Sequence(seq.encode()).get_reverse_complement())
    except Exception as e:
        print (e)
        return
    return seq
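A hypothetical call; the genome path and coordinates are made up:

seq = sequence_from_coords('hg38.fa', ('chr1', 11868, 12227, '-'))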
Example 13
        transcript[feature.attr["Parent"]]['CDSfeats'].append(feature.iv)
        ## Future worry: do I need CDS.frame in transcript object?
        CDSfeat[feature.iv] = feature

print(
    "# Chrom\tPos\tPos in CDS\tBase change\tAA change\tAA pos in transcript\ttranscript ID"
)
vcfr = HTSeq.VCF_Reader(sys.argv[3])

for vc in vcfr:
    vCDS = CDSfeat[vc.pos]
    # vCDS.iv.start is base before 1st base of CDS
    if vCDS is not None and vc.pos.start != vCDS.iv.start:
        vTranscript = transcript[vCDS.attr["Parent"]]
        refseq = str(
            HTSeq.Sequence(
                sequences[vCDS.iv.chrom].seq[vCDS.iv.start:vCDS.iv.end]))
        refseqT = ''.join(
            str(HTSeq.Sequence(sequences[CDS.chrom].seq[CDS.start:CDS.end]))
            for CDS in vTranscript['CDSfeats'])
        relpos = vc.pos.start - vCDS.iv.start
        # if variant is 1st base of CDS, relpos=1
        if refseq[relpos - 1] != vc.ref:
            print("ERROR: Reference Base not according to SNP")
        for alt in vc.alt:  # vc.alt is a list of alternative base(s)
            alternateSeq = refseq[0:relpos - 1] + alt + refseq[relpos:]
            altSeqT = ''
            relposT = 0
            exonFound = False
            transLen = 0
            for exon in vTranscript['CDSfeats']:
                transLen += exon.length
Example 14
def parse_matrix_scan(inFile):
    """
    Parses the RSAT matrix-scan output file and returns a list of dicts ordered by score    
    Keeps only one hit if 2 hits at the same position on both strands.
    Genomic coordinates are transfromed from 1-based (in matrix-scan output format) 
    into zero-based half open (like BED format) for internal representation and HTSeq compatibility.
    Assumes the "galaxy format" for sequence IDs in the first column.
    """

    # dict for unique sites
    unique_sites = {}

    # count number of total input sites
    n_sites = 0

    for line in open(inFile):

        # ignore comment lines
        if not line.startswith(';') and not line.startswith('#'):

            sp = line.strip().split('\t')
            loc = sp[0]

            chr = loc.split('_')[1]
            # the peak-coordinates are assumed now again ONE-based in matrix-scan output format!
            peak_start = int(loc.split('_')[2]) - 1

            # the motif coordinates are ONE-based and relative to peak start in the matrix-scan output file format
            start = peak_start + int(sp[4]) - 1
            end = peak_start + int(sp[5])

            strand = sp[3].replace('DR', '+').replace('D', '+').replace('R', '-')
            score = float(sp[7])

            # one based locus coordinates of the motif:
            motif_loc = chr + ":" + str(start + 1) + "-" + str(end)

            # keep it only if the score is greater than any hit already stored at the same location
            if motif_loc not in unique_sites or score > unique_sites[
                    motif_loc]["score"]:

                type = sp[1]
                ft_name = sp[2]
                seq = HTSeq.Sequence(sp[6], loc)

                # append region as dict with all annotations
                unique_sites[motif_loc] = {"chr":chr, "start":start, "end":end, \
                    "strand":strand, "score":score, "type":type, \
                    "ft_name":ft_name, "motif_seq":seq, "motif_loc":motif_loc,
                    "seq_id":loc, "name":motif_loc}

            n_sites += 1  # increase counter for total number of sites

    # get list of sites sorted by motif score (highest first)
    sorted_sites = sorted(unique_sites.values(),
                          key=lambda s: s['score'],
                          reverse=True)

    print("INFO: Read {0} of {1} input regions.".format(
        len(sorted_sites), n_sites))

    return sorted_sites
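A hypothetical usage; the file name is made up, and the keys come from the dicts built above:

sites = parse_matrix_scan('matrix_scan_output.tab')
for s in sites[:5]:
    print(s['name'], s['strand'], s['score'])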
Example 15
 * -i --input		fastq file
 * -b --basename	Three letter prefix which identifies a sample
output:
 * -o --output		fasta file
'''

import argparse
import HTSeq

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', dest="input", required=True)
parser.add_argument('-b', '--basename', dest="basename", required=True)
parser.add_argument('-o', '--output', dest="output", required=True)
args = parser.parse_args()

# count how many times each distinct read sequence occurs
uniques = {}
for s in HTSeq.FastqReader(args.input):
    if s.seq in uniques:
        uniques[s.seq] += 1
    else:
        uniques[s.seq] = 1

progressive_num = 1
with open(args.output, "w") as out:
    # one fasta record per distinct sequence, named <basename>_<number>_x<count>
    for sequence in uniques:
        name = "{0}_{1}_x{2}".format(args.basename, progressive_num,
                                     uniques[sequence])
        fasta_read = HTSeq.Sequence(name=name, seq=sequence)
        progressive_num += 1
        fasta_read.write_to_fasta_file(out)
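Invocation might look like this; the script and file names are hypothetical, and the three-letter basename follows the docstring above:

python collapse_reads.py -i sample.fastq -b SAM -o sample_collapsed.fa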