def main(fastq1, fastq2):
    with gzopen(fastq1) as f, gzopen(fastq2) as g:
        # Aggregate iterator of f,g iterators -> izip(f,g).
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            # Take only sequence and quality on lines 1 and 3 (mod 4).
            modulo = lineno % 4
            if modulo == 1:
                valid = False
                # Split on "CATG" and take the first fragment.
                # In case there is no "CATG", the barcode will be rejected
                # for being too long.
                brcd = line1.rstrip().split('CATG')[0]
                if not min_brcd < len(brcd) < max_brcd: continue
                # Use a Levenshtein automaton to find the transposon
                # sequence. The genomic position starts at the next position
                # (equal to 0 when there is no match).
                gpos = transposon.end(line2) + 1
                if not gpos: continue
                # Select the region from the end of the transposon to
                # the first "CATG", if any.
                genome = line2[gpos:].split('CATG')[0].rstrip()
                if len(genome) < min_genome: continue
                valid = True
            elif modulo == 3 and valid:
                qbrcd = score_from_quality(substr(line1, 0, len(brcd)))
                qgen = score_from_quality(substr(line2, gpos, len(genome)))
                sys.stdout.write('>%s:%d,%d\n%s\n' %
                                 (brcd, qbrcd, qgen, genome))
Example #2
def match_reads(file_name_1, file_name_2):
   outf = {}
   passed = False
   file1 = gzopen(file_name_1)
   file2 = gzopen(file_name_2)
   for (lineno, line1) in enumerate(file1):
      line2 = file2.readline()
      modulo = lineno % 4
      if modulo == 0 and passed:
         try:
            outf[index].write('@' + spotname + seq + '+\n' + quality)
         except KeyError:
            outf[index] = open(index + '.fastq', 'w')
            outf[index].write('@' + spotname + seq + '+\n' + quality)
      elif modulo == 1:
         index = line1[3:7]
         passed = test(line1[7:], line2)
         if passed:
            seq = line2[20:]
            spotname = line1[28:]
      elif modulo == 3 and passed:
         passed = testquality(line1[28:50], line2[20:50])
         quality = line2[20:]

   file1.close()
   file2.close()
   for key in outf: outf[key].close()
Example #3
def trimm_hic_reads(read1_fastq, read2_fastq):
    """This function trimms each read line at any uncut restriction enzyme site
   (GATC) and conserves the lefmost part. Then it output in fasta format. """

    # Build the names of the two output files.
    out1 = re.sub(r".fastq(\.gz)?", "read1.fasta", read1_fastq)
    out2 = re.sub(r".fastq(\.gz)?", "read2.fasta", read2_fastq)

    # Return early if both output files already exist.
    if os.path.exists(out1) and os.path.exists(out2):
        return [out1, out2]

    # We cut in enzyme restriction site GATC and make a fasta file
    with gzopen(read1_fastq) as f, gzopen(read2_fastq) as g, open(out1, "w") as y, open(out2, "w") as z:
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            if lineno % 4 != 1:
                continue
            seq1 = line1.rstrip().split("GATC")[0]
            seq2 = line2.rstrip().split("GATC")[0]
            if len(seq1) > 16 and len(seq2) > 16:
                y.write(">%d\n" % (lineno / 4))
                y.write(seq1 + "\n")
                z.write(">%d\n" % (lineno / 4))
                z.write(seq2 + "\n")
    print([out1, out2])
    return [out1, out2]
Example #4
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """This function takes the 2 pair-end sequencing files and extracts the
    barcode making sure that the other read contains the transposon."""

    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # The known parts of the sequences are matched with a Levenshtein
    # automaton. On the reverse read, the end of the transposon
    # corresponds to a 34 bp sequence ending as shown below. We allow
    # up to 5 mismatches/indels. On the forward read, the only known
    # sequence is the CATG after the barcode, which is matched exactly.
    pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

    # Open a file to write
    fname_fasta = re.sub(r'[\_F][w\_].fastq(\.gz)?', 'iPCR.fasta',
                         fname_iPCR_PE1)

    # Substitution failed, append '.fasta' to avoid name collision.
    if fname_fasta == fname_iPCR_PE1:
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    # Skip if file exists.
    if os.path.exists(fname_fasta):
        return fname_fasta

    with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
            open(fname_fasta, 'w') as outf:
        # Aggregate iterator of f,g iterators -> izip(f,g).
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            # Take sequence only.
            if lineno % 4 != 1:
                continue
            # Split on "CATG" and take the first fragment.
            # In case there is no "CATG", the barcode will be rejected
            # for being too long.
            brcd = line1.rstrip().split('CATG')[0]
            if not MIN_BRCD < len(brcd) < MAX_BRCD:
                continue
            # Use a Levenshtein automaton to find the transposon.
            genome = pT2.matchSuffix(line2, False)
            if not genome:
                continue
            # Select the region from the end of the transposon to
            # the first "CATG", if any.
            genome = genome.split('CATG')[0].rstrip()
            if len(genome) < MIN_GENOME:
                continue
            outf.write('>%s\n%s\n' % (brcd, genome))

    return fname_fasta
Example #5
    def parse(cls, fname1, fname2):
        '''Iterator that yields objects from a pair of fastq files.'''

        with gzopen(fname1) as f, gzopen(fname2) as g:
            for lineno, (line1, line2) in enumerate(izip(f, g)):
                if lineno % 4 == 1:
                    # Read the sequence.
                    read1 = line1.rstrip()
                    read2 = line2.rstrip()
                if lineno % 4 == 3:
                    # Read the quality and yield the object.
                    qual1 = line1.rstrip()
                    qual2 = line2.rstrip()
                    yield cls(read1, read2, qual1, qual2)
Example #6
def parse_fq(filename):
    """
    fastq parser
    """

    state = 0
    label = None
    qual = None

    with gzopen(filename) as f:
        for line in f:
            line = line.rstrip("\r\n")
            if len(line) == 0:
                continue

            # fastq
            if line[0] == "@":
                if label is not None:
                    yield label, "".join(seq), "".join(qual)
                state = 1
                label = line
                seq = []
                qual = []
                continue
            elif line[0] == "+":
                state = 2
                continue

            if state == 1:
                seq.append(line)
            elif state == 2:
                qual.append(line)

    if label is not None:
        yield label, "".join(seq), "".join(qual)
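
# A minimal usage sketch (not part of the original snippet; the file name is
# hypothetical): parse_fq yields (label, sequence, quality) tuples.
for label, seq, qual in parse_fq("reads.fastq.gz"):
    print("%s\t%d" % (label, len(seq)))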
Example #7
def binit(bin_size, fname, limits=limits_hg19):

   getmap = itemgetter(0,2)
   unique_reads = set([])
   counts = defaultdict(int)

   with gzopen(fname) as f:
      for line in f:
         # Fields: read name, sequence, quality, map count, position(s).
         item = line.rstrip().split('\t')
         # Keep only reads with unique map: "0:1" or "1+...".
         match_uniq = re.search(r'^[0:+]*1(?:\Z|\D)', item[3])
         if match_uniq:
            mapping = item[4]
            # 'mapping' is like "chr6:+:52132829:2C29".
            chrom,pos = getmap(mapping.split(':'))
            # Keep only one read (the first) of a read series.
            if (chrom,pos) in unique_reads: continue
            unique_reads.add((chrom,pos))
            counts[(chrom,int(pos)/bin_size)] += 1
         continue


   for chrom,size in sorted(limits.items()):
      for b in range(size/bin_size):
         sys.stdout.write("%s\t%d\t%d\t%d\n" % \
            (chrom, 1+b*bin_size, (1+b)*bin_size, counts[(chrom,b)]))
Example #8
def call_starcode_fastq_file(fastq):
    #pdb.set_trace()
    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = fastq + '_barcodes.tsv'
    spk_outfname = fastq + '_spikes.tsv'

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)
    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)

    with gzopen(fastq) as f:
        outf = None
        for lineno, line in enumerate(f):
            if lineno % 4 != 1: continue
            hit = GFP.match(line)
            if hit is not None:
                outf = barcode_tempf
            else:
                hit = SPIKE.match(line)
                if hit is not None:
                    outf = spike_tempf
                else:
                    continue
            pos = hit.matchlist[0][0]
            if MIN_BRCD <= pos <= MAX_BRCD:
                outf.write(line[:pos] + '\n')
    barcode_tempf.close()
    spike_tempf.close()

    subprocess.call([
        'starcode',
        '-t4',
        '-i',
        barcode_tempf.name,
        '-o',
        brcd_outfname,
    ])

    subprocess.call([
        'starcode',
        '-t4',
        '-i',
        spike_tempf.name,
        '-o',
        spk_outfname,
    ])

    # Delete temporary files.
    os.unlink(barcode_tempf.name)
    os.unlink(spike_tempf.name)

    # Save the names of the files processed
    #processed.append([brcd_outfname,spk_outfname])
    processed.append(brcd_outfname)
    spikessed.append(spk_outfname)
    #pdb.set_trace()
    return
Example #9
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
   """This function takes the 2 pair-end sequencing files and extracts the
   barcode making sure that the other read contains the transposon."""

   MIN_BRCD = 15
   MAX_BRCD = 25
   MIN_GENOME = 15

   # The known parts of the sequences are matched with a Levenshtein
   # automaton. On the reverse read, the end of the transposon
   # corresponds to a 34 bp sequence ending as shown below. We allow
   # up to 5 mismatches/indels. On the forward read, the only known
   # sequence is the CATG after the barcode, which is matched exactly.
   pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

   # Open a file to write
   fname_fasta = re.sub(r'read[1-2].fastq(\.gz)?', 'iPCR.fasta',
         fname_iPCR_PE1)
   # Substitution failed, append '.fasta' to avoid name collision.
   if fname_fasta == fname_iPCR_PE1:
      fname_fasta = fname_iPCR_PE1 + '.fasta'

   # Skip if file exists.
   if os.path.exists(fname_fasta): return fname_fasta
    
   with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
      open(fname_fasta, 'w') as outf:
      # Aggregate iterator of f,g iterators -> izip(f,g).
      for lineno,(line1,line2) in enumerate(izip(f,g)):
         # Take sequence only.
         if lineno % 4 != 1: continue
         # Split on "CATG" and take the first fragment.
         # In case there is no "CATG", the barcode will be rejected
         # for being too long.
         brcd = line1.rstrip().split('CATG')[0]
         if not MIN_BRCD < len(brcd) < MAX_BRCD: continue
         # Use a Levenshtein automaton to find the transposon.
         genome = pT2.matchSuffix(line2, False)
         if not genome: continue
         # Select the region from the end of the transposon to
         # the first "CATG", if any.
         genome = genome.split('CATG')[0].rstrip()
         if len(genome) < MIN_GENOME: continue
         outf.write('>%s\n%s\n' % (brcd,genome))

   return fname_fasta
Example #10
def parse_tsv(fn, ichrom, ipos, chrmap, skip=1):
    with gzopen(fn) as f:
        while skip:
            skip -= 1
            next(f)
        for line in f:
            fields = line.split()
            key = (chrmap[fields[ichrom]], int(fields[ipos]))
            yield key, fields
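
# A usage sketch (not part of the original snippet; the file name and
# chromosome map are hypothetical): parse_tsv yields a (chromosome index,
# position) key with each record, so rows from two files can be compared and
# merged in genomic order, as the merge loop in a later example does.
chrmap = {"chr1": 0, "chr2": 1, "chrX": 2}
for key, fields in parse_tsv("sites.tsv.gz", ichrom=0, ipos=1, chrmap=chrmap):
    pass  # key is a tuple such as (0, 12345)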
Example #11
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """This function takes the 2 pair-end sequencing files and extracts the
    barcode making sure that the other read contains the transposon."""

    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # The known parts of the sequences are matched with a Levenshtein
    # automaton. On the reverse read, the end of the transposon
    # corresponds to a 34 bp sequence ending as shown below. We allow
    # up to 5 mismatches/indels. On the forward read, the only known
    # sequence is the CATG after the barcode, which is matched exactly.

    # Open a file to write
    fname_fasta = re.sub(r'[A-Za-z]+_iPCR_([\w]+)_[a-zA-Z0-9]+.fastq',
                         r'iPCR_\1.fasta',
                         fname_iPCR_PE1)

    # Substitution failed, append '.fasta' to avoid name collision.
    if fname_fasta == fname_iPCR_PE1:
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    # Skip if file exists.
    if os.path.exists(fname_fasta):
        return fname_fasta

    with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
            open(fname_fasta, 'w') as outf:
        # Aggregate iterator of f,g iterators -> izip(f,g).
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            # Take sequence only.
            if lineno % 4 != 1:
                continue
            brcd = line1[:20]
            if not MIN_BRCD < len(brcd) < MAX_BRCD:
                continue
            # Let's rely on the bwa mapping results to decide.
            genome = line2.rstrip()
            if len(genome) < MIN_GENOME:
                continue
            outf.write('>%s\n%s\n' % (brcd, genome))

    return fname_fasta
Example #12
def binit(bin_size, input_file, limits=limits_hg19, output_dir='.'):
   """
   select the reads that are mapped uniquely
   tag the reads that mapped multiple times as repeats (NA)
   bin into certain window size
   """

   getmap = itemgetter(0,2)
   unique_maps = set()
   unique_counts = defaultdict(int)
   multiple_counts = defaultdict(int)

   with gzopen(input_file) as f:
      for line in f:
         # Fields: read name, sequence, quality, map count, position(s).
         item = line.rstrip().split('\t')
         # keep only reads with unique map
         # the following scenarios are accepted
         # 1.. ; 0:0:...:0:1...; 0:0:0+1... 
         if item[4] == '-': continue
         stratum_size = int(re.search(r'^[0:+]*(\d+)', item[3]).groups()[0])
         #match_uniq = re.search(r'^[0:+]*1(?:\Z|\D)', item[3])
         thisdict = unique_counts if stratum_size == 1 else multiple_counts
         for hit in item[4].split(',')[:stratum_size]:
            #mapping = item[4]
            # 'mapping' is like "chr1:+:12942:34T1,chr15:-:102518193:34T1"
            #chrom,pos = getmap(mapping.split(':'))
            chrom,pos = getmap(hit.split(':'))
            # Keep only one read (the first) of a read series.
            if (chrom,pos) in unique_maps: continue
            unique_maps.add((chrom,pos))
            thisdict[(chrom,int(pos)/bin_size)] += 1

   #output file
   if not os.path.exists(output_dir):
      try:
         os.makedirs(output_dir)
      except OSError as exception:
         if exception.errno != errno.EEXIST:
            raise

   head, tail = os.path.split(input_file)
   base = os.path.splitext(tail)[0]
   output_fname="%sbin-%s.bed" %(bin_size,base)
   output_file = str(os.path.join(output_dir, output_fname))

   with open(output_file, 'w') as output_f:
      for chrom,size in sorted(limits.items()):
         for b in range(size/bin_size):
            coord = (chrom,b)
            mapping = "%s\t%d\t%d\t" % (chrom, 1+b*bin_size, (1+b)*bin_size)
            unmappable = multiple_counts[coord] and not unique_counts[coord]
            count = 'NA' if unmappable else str(unique_counts[(chrom,b)])
            output_f.write(mapping + count + '\n')
Example #13
def call_starcode_fastq_file(fastq):
   #pdb.set_trace()
   MIN_BRCD = 15
   MAX_BRCD = 25
      
   brcd_outfname = fastq + '_barcodes.tsv'
   spk_outfname  = fastq + '_spikes.tsv'
   
   GFP   = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
   SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)
   barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
   spike_tempf   = tempfile.NamedTemporaryFile(delete=False)
   
   with gzopen(fastq) as f:
      outf = None
      for lineno,line in enumerate(f):
         if lineno % 4 != 1: continue
         hit = GFP.match(line)
         if hit is not None:
            outf = barcode_tempf
         else:
            hit = SPIKE.match(line)
            if hit is not None:
               outf = spike_tempf
            else:
               continue
         pos = hit.matchlist[0][0]
         if MIN_BRCD <= pos <= MAX_BRCD:
            outf.write(line[:pos] + '\n')
   barcode_tempf.close()
   spike_tempf.close()

   subprocess.call([
      'starcode',
      '-t4',
      '-i', barcode_tempf.name,
      '-o', brcd_outfname,])
      
   subprocess.call([
      'starcode',
      '-t4',
      '-i', spike_tempf.name,
      '-o', spk_outfname,])

   # Delete temporary files.
   os.unlink(barcode_tempf.name)
   os.unlink(spike_tempf.name)

   # Save the names of the files processed
   #processed.append([brcd_outfname,spk_outfname])
   processed.append(brcd_outfname)
   spikessed.append(spk_outfname)
   #pdb.set_trace()
   return
Example #14
def parse_dbSNP_dump(fn):
    with gzopen(fn) as f:
        for line in f:
            fields = line.split()
            rs = fields[0]
            ss = fields[1]
            chrom = fields[2]
            pos = int(fields[3])
            ref = fields[4]
            alt = fields[5]
            yield rs, ss, chrom, pos, ref, alt
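
# A usage sketch (not part of the original snippet; the file name is
# hypothetical): build a lookup of dbSNP records keyed by (chrom, pos).
snp_index = {}
for rs, ss, chrom, pos, ref, alt in parse_dbSNP_dump("dbsnp_dump.txt.gz"):
    snp_index[(chrom, pos)] = (rs, ref, alt)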
Example #15
def trimm_hic_reads(read1_fastq, read2_fastq):
    '''This function trims each read at any uncut restriction enzyme site
    (GATC) and keeps the leftmost part, then outputs the result in fasta format.'''

    # Build the names of the two output files.
    out1 = re.sub(r'.fastq(\.gz)?', 'read1.fasta', read1_fastq)
    out2 = re.sub(r'.fastq(\.gz)?', 'read2.fasta', read2_fastq)

    # We cut in enzyme restriction site GATC and make a fasta file
    with gzopen(read1_fastq) as f, gzopen(read2_fastq) as g, \
         open(out1,'w') as y, open(out2,'w') as z:
        for lineno, (line1, line2) in enumerate(izip(f, g)):
            if lineno % 4 != 1: continue
            seq1 = line1.rstrip().split('GATC')[0]
            seq2 = line2.rstrip().split('GATC')[0]
            if len(seq1) > 16 and len(seq2) > 16:
                y.write('>%d\n' % (lineno / 4))
                y.write(seq1 + '\n')
                z.write('>%d\n' % (lineno / 4))
                z.write(seq2 + '\n')
    return (out1, out2)
Example #16
def parse_methlist(filename):
    with gzopen(filename) as f:
        next(f)  # skip header
        # rintf("chrom\tpos-0\tpos-1\tstrand\tdepth\tC\tmC\tcontext\n");
        for line in f:
            line = line.rstrip()
            fields = line.split("\t")
            chrom = fields[0]
            start, end = map(int, fields[1:3])
            strand = fields[3]
            depth, C, mC = map(int, fields[4:7])
            context = fields[7]
            yield chrom, start, end, strand, depth, C, mC, context
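
# A usage sketch (not part of the original snippet; the file name is
# hypothetical): one common per-site summary is the fraction of methylated
# calls, mC divided by depth.
for chrom, start, end, strand, depth, C, mC, context in parse_methlist("meth.txt.gz"):
    frac = float(mC) / depth if depth else 0.0
    print("%s\t%d\t%s\t%.3f" % (chrom, start, context, frac))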
Example #17
def parse_trusted(fn):
    rev = {"ADBR": True, "ARBD": False}
    with gzopen(fn) as f:
        for row in csv.reader(f):
            snp_num = int(row[0])
            chrom = row[1]
            if chrom == "30":
                chrom = "X"
            pos = int(row[2])
            ref = row[3]
            alt = row[4]
            key = row[5]
            yield snp_num, chrom, pos, ref, alt, rev[key]
Example #18
def file2ngram_info(infile, min_len, max_len):
    """ Given a d3_feats file, return a list of tab separated strings of the form:
    <ngram_length> <canonicalized term> <surface term> <doc_id> <pos_signature>
    e.g., 3       epitaxial silicon process       epitaxial silicon processes     000171485800006 JNN
    NOTE: All elements are returned as strings, including the <ngram_length>
    min_len and max_len constrain the length of ngrams to be included in output.
    """

    #print("[file2ngram_info] %s" % infile) ///
    s_infile = gzopen.gzopen(infile)
    # list of lists of info to be returned for each line of input file
    l_term_info = []
    for line in s_infile:
        line = line.strip("\n")
        l_fields = line.split("\t")
        filename = l_fields[0]
        doc_id = path_base_name(filename)
        term = l_fields[2]
        ngram_len = len(term.split(" "))

        # continue if conditions for the term are met (ngram length and filter check)
        if (ngram_len >= min_len) and (ngram_len <= max_len) and not(canon.illegal_phrase_p(term)) :

            canon_np = can.get_canon_np(term)
            # We assume that the last feature on the line is tag_sig!
            pos_sig = l_fields[-1]
            if pos_sig[:7] != "tag_sig":
                print ("[ngram_extract.py]Error: last feature on input line is not labeled tag_sig")
                print ("line: %s" % line)
                sys.exit()
            else:
                # replace pos_sig with a string made of the first char of each pos in the phrase
                # e.g. JJ_NN_NNS => JNN
                pos_sig = "".join(item[0] for item in pos_sig[8:].split("_"))

                prev_Npr = ""
                prev_N = ""
                # grab the prev_Npr feature, if there is one
                try:
                    # extract the value of the prev_Npr feature, if there is one.
                    match = re.search(r'prev_Npr=(\S+)	', line)
                    prev_Npr = match.group(1)
                    # canonicalize the noun
                    prev_N = can.get_canon_np(prev_Npr.split("_")[0])
                except:
                    pass

                l_term_info.append([str(ngram_len), canon_np, term, doc_id, pos_sig, prev_Npr, prev_N])

    s_infile.close()
    return(l_term_info)
Example #19
def parse_snp50_csv(fn):
    with gzopen(fn) as f:
        for row in csv.DictReader(f):
            snp_num = int(row["snp_number"])
            ss = row["ss_id"]
            rs = row["rs_id"]
            if rs:
                rs = "rs" + rs
            chrom = row["umd30_bta"]
            if chrom == "30":
                chrom = "X"
            pos = row["umd30_pos"]
            try:
                pos = int(pos)
            except ValueError:
                pass
            yield snp_num, ss, rs, chrom, pos
Example #20
def merge(fnamelist):
   flist = [gzopen(fname) for fname in fnamelist]

   # Get names and print header.
   idexpr = r'-(\d{3}[a-z]?)'
   IDs = [re.search(idexpr, fname).group(1) for fname in fnamelist]
   sys.stdout.write('seqname\tstart\tend\t' + '\t'.join(IDs) + '\n')

   # Iterate through all the files at the same time with 'izip'.
   for linetuple in izip(*flist):
      # Extract seqname, start and end from first file.
      mapping = '\t'.join(linetuple[0].split()[:3])
      # Extract 4-th column and print.
      entries = '\t'.join([line.split()[3] for line in linetuple])
      sys.stdout.write(mapping + '\t' + entries + '\n')

   for f in flist:
      f.close()
Example #21
def main(mapfile):
    with gzopen(mapfile) as f:
        for line in f:
            items = line.split()
            brcd = items[0].split(':')[0]
            # The character '-' at the end of the line indicates
            # that there is no hit for the sequence.
            if items[-1] == '-':
                brcd_counter[brcd][no_hits] += 1
                continue
            # In case that there are several hits, they will be
            # separated by ",".
            try:
                (loc, ) = items[-1].split(',')
            except ValueError:
                brcd_counter[brcd][many_hits] += 1
                continue
            (chrom, strand, pos, ignore) = loc.split(':')
            brcd_counter[brcd][(chrom, pos)] += 1
            pos_counter[(chrom, pos)][brcd] += 1

    # Find the Charlies.
    Charlies = set()
    for (pos, counts) in pos_counter.items():
        try:
            ((brcd1, alpha), (brcd2, beta)) = counts.most_common(2)
        except ValueError:
            continue
        if alpha < 8 * beta:
            Charlies.add(pos)
            print pos, counts

    print '----'

    # Find the Bobs.
    Bobs = set()
    for (brcd, counts) in brcd_counter.items():
        try:
            ((pos1, alpha), (pos2, beta)) = counts.most_common(2)
        except ValueError:
            ((pos1, alpha), ) = counts.most_common(1)
            beta = 0
        if alpha < 8 * beta: Bobs.add(brcd)
        elif pos1 not in position_unknown: print brcd, pos1, alpha
Example #22
def parse_snp_chip_txt(fn):
    data = {}
    lineno = 0
    with gzopen(fn) as f:
        lineno += 1
        hline = next(f).rstrip("\r\n")
        headers = hline.split("\t")[1:]
        for line in f:
            lineno += 1
            line = line.rstrip("\r\n")
            fields = line.split("\t")
            snp_num = int(fields[0])
            data[snp_num] = fields[1:]

            if len(data[snp_num]) != len(headers):
                raise ParseError(
                    "Error: {}: line {}: mismatch between number of headers ({}) and data array length ({})."
                    .format(fn, lineno, len(headers),
                            len(data[snp_num])))

    return headers, data
Example #23
def get_Unmap(input_file, output_dir='.'):
   """ get the unmapped reads from the gemMap output """

   try:
      with gzopen(input_file) as in_f:
         gc.disable()
         reads_unmapped = []
         for line in in_f:
            items= line.rstrip().split('\t')
            if items[3] =="0:1":
               # collect all the fastq elements according to the fastq format (ENCODE)
               # 1. the ids of the reads
               # 2. the read seq
               # 3. the strand + the ids
               # 4. the quality code
               # the unmapped regions of the gemMap are given '+' strand(assumption)
               fastq_items = []
               fastq_items.extend(["@"+items[0], items[1], "+"+items[0], items[2]])
               reads_unmapped.append(fastq_items)
         gc.enable()
   except:
      sys.stderr.write('file error:%s'%(os.path.splitext(input_file)[0]))
      raise

   # dump for the output...
   if not os.path.exists(output_dir):
      try:
         os.makedirs(output_dir)
      except OSError as exception:
         if exception.errno != errno.EEXIST:
            raise
   head, tail = os.path.split(input_file)
   base = os.path.splitext(tail)[0]
   output_fname_unmap ="unmapped-%s.fastq" %(base)
   output_file_unmap = str(os.path.join(output_dir, output_fname_unmap))

   with open(output_file_unmap, 'w') as output_f_unmap:
      for line in reads_unmapped:
         for items in line:
            output_f_unmap.write(items+'\n')
Example #24
def dir2features_count(filelist_file,
                       out_root,
                       sections,
                       year,
                       overwrite_p,
                       max_doc_terms_count=1000,
                       canonicalize_p=True,
                       filter_noise_p=True):
    #pdb.set_trace()
    out_path = "/".join([out_root, sections])
    out_path_prefix = "/".join([out_path, year])
    # term-feature output file
    tf_file = out_path_prefix + ".tf"
    # remember the mapping between surface head nouns and their canonicalized forms
    canon_file = out_path_prefix + ".canon"

    # create the outpath if it doesn't exist yet
    print("[act_tf.py]creating path: %s,\n[act_tf.py]writing to %s" %
          (out_path, tf_file))

    try:
        # create directory path for corpus, if it does not already exist
        os.makedirs(out_path)
    except:
        print("[act_tf.py]NOTE: Path already exists (or cannot be created).")

    # Do not continue if the .tf file already exists for this corpus and year
    if os.path.isfile(tf_file) and not overwrite_p:
        print "[tf.py]file already exists: %s.  No need to recompute." % tf_file
    else:

        terms_file = out_path_prefix + ".terms"
        feats_file = out_path_prefix + ".feats"
        corpus_size_file = out_path_prefix + ".cs"
        doc_terms_file = out_path_prefix + ".doc_terms"
        # store each filename with a list of its terms
        s_doc_terms_file = codecs.open(doc_terms_file, "w", encoding='utf-8')

        # count of number of docs a term pair cooccurs in
        # dfreq is document freq, cfreq is corpus freq
        #d_pair_freq = defaultdict(int)
        d_pair2dfreq = defaultdict(int)
        # corpus count for the pair
        d_pair2cfreq = defaultdict(int)
        # count of number of docs a term occurs in
        #d_term_freq = defaultdict(int)
        d_term2dfreq = defaultdict(int)
        # count of number of instances of a term
        #d_term_instance_freq = defaultdict(int)
        d_term2cfreq = defaultdict(int)
        # count of number of instances of a feature
        #d_feat_instance_freq = defaultdict(int)
        d_feat2cfreq = defaultdict(int)
        # count of number of docs a feature occurs in
        #d_feat_freq = defaultdict(int)
        d_feat2dfreq = defaultdict(int)

        # doc_count needed for computing probs
        doc_count = 0

        # open list of all the files in the inroot directory
        s_filelist = open(filelist_file)

        #print "inroot: %s, filelist: %s" % (inroot, filelist)

        # iterate through files in filelist
        for infile in s_filelist:
            infile = infile.strip("\n")

            # Create a tab separated string containing the filename and all (legal) canonicalized terms, including
            # duplicates.  This will be used to populate a doc_term retrieval system in
            # elasticsearch.
            # First field will be the filename.
            # At this point, we'll collect the filename and terms into a list.
            # The file without path or extensions should be a unique doc id.
            doc_id = os.path.basename(infile).split(".")[0]
            doc_terms_list = [doc_id]

            # dictionaries to sum up statistics
            # number of times a term appears in the doc
            d_term2count = defaultdict(int)
            d_feat2count = defaultdict(int)
            # number of times a term appears with a specific feature in the doc
            d_pair2count = defaultdict(int)

            # process the dictionaries
            # for each file, create a set of all term-feature pairs in the file
            #/// dictionaries are functionally redundant with sets here.
            # Use sets to capture which terms, features, and pairs occur in the
            # document.  We'll use this after processing each doc to update the
            # doc frequencies of terms, features, and pairs.
            pair_set = set()
            term_set = set()
            feature_set = set()
            #pdb.set_trace()

            s_infile = gzopen.gzopen(infile)
            # count number of lines in file
            i = 0

            # iterate through lines in d3_feats file
            for term_line in s_infile:
                i += 1
                term_line = term_line.strip("\n")
                l_fields = term_line.split("\t")

                term = l_fields[2]

                # Do not process noise (illegal) terms or features
                #  for cases where feat = "", need to filter!  todo
                #pdb.set_trace()
                if (filter_noise_p and canon.illegal_phrase_p(term)):
                    pass

                # eliminate lines that come from claims section of patents.
                # These are not very useful and skew term frequency counts.
                # We do this by eliminating lines containing the feature section_loc=CLAIM*.
                if ("=CLAIM" in term_line):
                    pass

                # NOTE: At the moment we don't test which sections of the doc should be included
                # as specified by the sections parameter (ta or tas).  We include every line.  If
                # we decide to add this functionality, this would be the place to add the filter.

                else:

                    if canonicalize_p:
                        # Do canonicalization of term before incrementing counts
                        #feature = can.get_canon_feature(feature)
                        term = can.get_canon_np(term)

                    # increment the within doc count for the term
                    ##d_term2count[term] += 1
                    term_set.add(term)
                    # increment the global corpus count for the term
                    d_term2cfreq[term] += 1

                    # Add the term to the list of terms for the current doc
                    # Ideally, we would like to ignore parts of a patent (e.g. the claims) and
                    # just use the title, abstract and summary.  However, there is no feature
                    # indicating what section we are in beyond the abstract.  So instead, we
                    # will use a simple doc_terms_count cut off (e.g. 1000). Variable i counts
                    # the number of lines so far.

                    #pdb.set_trace()
                    if (i <= max_doc_terms_count) and (
                            term not in DOC_TERMS_NOISE
                    ) and not canon.illegal_phrase_p(term):
                        doc_terms_list.append(term)

                    # fields 3 and beyond are feature-value pairs
                    # look for features of interest using their prefixes
                    for feature in l_fields[3:]:
                        # Note that we use the prefixes of some feature names for convenience.
                        # The actual features are prev_V, prev_VNP, prev_J, prev_Jpr, prev_Npr, last_word
                        # first_word, if an adjective, may capture some indicators of dimensions (high, low), although
                        # many common adjectives are excluded from the chunk and would be matched by prev_J.
                        # we also pull out the sent and token locations to allow us to locate the full sentence for this
                        # term-feature instance.
                        if (feature[0:6] in [
                                "prev_V", "prev_J", "prev_N", "last_w"
                        ]) and not canon.illegal_feature_p(feature):

                            if canonicalize_p and not "-" in feature:
                                # Do canonicalization of feature before incrementing counts.
                                # NOTE: There is a bug in the canonicalization code when the
                                # term contains hyphens. For example:
                                # >>> can.get_canon_feature("last_word=compass-on-a-chip")
                                # Returns a term with a blank in it: 'last_word=compas-on-a chip'
                                # for this reason, we will not try to canonicalize terms containing
                                # a hyphen.

                                feature = can.get_canon_feature(feature)

                            # increment global corpus count for the feature
                            d_feat2cfreq[feature] += 1

                            feature_set.add(feature)
                            # increment global corpus count for the pair
                            d_pair2cfreq[(term, feature)] += 1
                            # increment the within doc count for the term feature pair
                            ##d_pair2count[(term, feature)] += 1
                            pair_set.add((term, feature))

            # construct a tab-separated string containing file_name and all terms
            doc_terms_str = "\t".join(doc_terms_list)

            s_doc_terms_file.write("%s\n" % doc_terms_str)

            s_infile.close()

            # Using the sets, increment the doc_freq for term-feature pairs in the doc.
            # By making the list a set, we know we are only counting each term-feature combo once
            # per document
            for pair in pair_set:
                d_pair2dfreq[pair] += 1

            # also increment doc_freq for features and terms

            for term in term_set:
                d_term2dfreq[term] += 1

            for feature in feature_set:
                d_feat2dfreq[feature] += 1

            # track total number of docs
            doc_count += 1

        s_filelist.close()

        s_tf_file = codecs.open(tf_file, "w", encoding='utf-8')
        s_terms_file = codecs.open(terms_file, "w", encoding='utf-8')
        s_feats_file = codecs.open(feats_file, "w", encoding='utf-8')
        print "[act_tf.py]Writing to %s" % tf_file

        # compute prob
        print "[act_tf.py]Processed %i files" % doc_count

        for pair in d_pair2dfreq.keys():
            freq_pair = d_pair2dfreq[pair]
            prob_pair = float(freq_pair) / doc_count

            term = pair[0]

            feature = pair[1]
            freq_term = d_term2dfreq[term]
            freq_feat = d_feat2dfreq[feature]

            # Occasionally, we come across a term in freq_pair which is not actually in
            # the dictionary d_term2dfreq.  It returns a freq of 0.  We need to ignore these
            # cases, since they will create a divide by 0 error.
            if freq_term > 0 and freq_feat > 0:

                # probability of the feature occurring with the term in a doc, given that
                # the term appears in the doc
                try:
                    prob_fgt = freq_pair / float(freq_term)
                except:
                    pdb.set_trace()

                # added 4/4/15: prob of the feature occurring with the term in a doc, given that
                # the feature appears in the doc
                try:
                    prob_tgf = freq_pair / float(freq_feat)
                except:
                    pdb.set_trace()

                # 4/18/15 adding mutual information based on count of pairs, terms, feats (counted once per doc),
                # and corpus size (# docs)
                # MI = prob(pair) / prob(term) * prob(feature)
                #prob_term = float(d_term2dfreq[term])/doc_count
                #prob_feature = float(d_feat2dfreq[term])/doc_count
                mi_denom = (freq_term) * (freq_feat) / float(doc_count)
                mi = math.log(freq_pair / mi_denom)
                # normalize to -1 to 1
                # Note: if prob_pair == 1, then log is 0 and we risk dividing by 0
                # We'll prevent this by subtracting a small amt from prob_pair
                if prob_pair == 1:
                    prob_pair = prob_pair - .000000001
                npmi = mi / (-math.log(prob_pair))
                s_tf_file.write("%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n" %
                                (term, feature, freq_pair, prob_pair, prob_fgt,
                                 prob_tgf, freq_term, freq_feat, mi, npmi))

            else:
                # print out a warning about terms with 0 freq.
                print "[act_tf.py]WARNING: term-feature pair: %s has freq = 0. Ignored." % l_pair

        for term in d_term2dfreq.keys():
            term_prob = float(d_term2dfreq[term]) / doc_count
            s_terms_file.write(
                "%s\t%i\t%i\t%f\n" %
                (term, d_term2dfreq[term], d_term2cfreq[term], term_prob))

        for feat in d_feat2dfreq.keys():
            feat_prob = float(d_feat2dfreq[feat]) / doc_count
            s_feats_file.write(
                "%s\t%i\t%i\t%f\n" %
                (feat, d_feat2dfreq[feat], d_feat2cfreq[feat], feat_prob))

        s_canon_file = codecs.open(canon_file, "w", encoding='utf-8')
        for key, value in can.d_n2canon.items():
            # Only write out a line if the canonical form differs from the surface form
            if key != value:
                s_canon_file.write("%s\t%s\n" % (key, value))
        s_canon_file.close()

        s_tf_file.close()
        s_terms_file.close()
        s_feats_file.close()

        s_doc_terms_file.close()

        # Finally, create a file to store the corpus size (# docs in the source directory)
        cmd = "ls -1 " + filelist_file + " | wc -l > " + corpus_size_file

        s_corpus_size_file = open(corpus_size_file, "w")
        s_corpus_size_file.write("%i\n" % doc_count)
        s_corpus_size_file.close()
        print "[act_tf.py dir2features_count]Storing corpus size in %s " % corpus_size_file
Example #25
import fileinput
import os
import pdb
import re
import seeq
import sys
import subprocess
import tempfile
from collections import defaultdict
from gzopen import gzopen
from itertools import izip

TOMAPfname = sys.argv[1] + '_2map'
#pdb.set_trace()
with gzopen(sys.argv[1]) as f, open(TOMAPfname,'w') as g:
   for lineno,line in enumerate(f):
      # This is a fastq file: keep only the sequence lines.
      if lineno % 4 != 1: continue
      # Exact search of NlaIII
      brcd = line.rstrip().split('CATG')[0]
      if len(brcd) == len(line.rstrip()): continue
      seq  = line.rstrip().split('CATG')[1]   
      # Cut if there is a MlucI site
      dna  = seq.split('AATT')[0]
      # Write fasta to map it 
      if not 10 < len(brcd) < 22 : continue
      if not 5 < len(dna): continue 
      g.write('>%s\n%s\n' % (brcd,dna))


# Map the sequences
Example #26
# Sets up arguments for user input
def setupParser():
    # Set up argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        action="store",
                        help="fastq.gz file you want to convert",
                        type=str)
    parser.add_argument("-o",
                        "--output",
                        action="store",
                        help="Name of the file you want the converted file.",
                        type=str)
    args = parser.parse_args()

    return (args)


# Code starts here
if __name__ == "__main__":
    args = setupParser()
    input_file, output_file, sample_index = parseArguments(args)

    with gzopen.gzopen(input_file) as input_fastq:
        with gzip.open(output_file, "wb") as output_fastq:
            for line in input_fastq:
                parsed_line = parseLine(line, sample_index=sample_index)
                output_fastq.write(parsed_line)

    print "Conversion of %s complete!" % input_file
Example #27
def map_reads_to_exons(f, dTree):
    for line in f:
        items = line.split()
        sgRNA = items[9]
        nreads = items[0].split('_')[-1]
        chrom = items[2]
        pos = int(items[3])
        if chrom not in dTree:
            # Do not consider alternate chromosomes
            # do not add them to 'dNread'.
            continue
        seen = set()
        for hit in dTree[chrom].query(pos):
            if hit.data in seen: continue
            # Every hit is a gene intersected by the position
            # of the gRNA (more specifically one exon of the
            # gene is intersected by the position of the gRNA).
            sys.stdout.write('%s\t%s\t%s\n' % (sgRNA, hit.data, nreads))
            seen.add(hit.data)


if __name__ == '__main__':
    sys.setrecursionlimit(10000)
    # Prepare transcript-gene lookup
    with gzopen(sys.argv[1]) as f:
        cast = read_dict(f)
    # Convert exon locations to interval tress for fast search.
    with gzopen(sys.argv[2]) as f:
        dTree = exons_to_interval_tree(f, cast)
    # Intersect read positions with exons.
    with gzopen(sys.argv[3]) as f:
        map_reads_to_exons(f, dTree)
Example #28
# -*- coding: utf-8 -*-

import re
import sys
from gzopen import gzopen

with gzopen(sys.argv[1]) as f:
   for line in f:
      shit,score = line.rstrip('\n').split('\t')
      if int(score) < 11: continue
      pair = re.sub("[')(]", '', shit).replace(' ', '_').split(',_')
      sys.stdout.write('%s (u) %s = %s\n' % (pair[0], pair[1], score))
      #sys.stdout.write('%s u %s\n' % tuple(pair))
Example #29
def call_starcode_on_fastq_file(fname_fastq):
   ''' Extracts the gDNA/cDNA reads and spikes and runs starcode on them.'''
   MIN_BRCD = 15
   MAX_BRCD = 25

   brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
   spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
   if brcd_outfname == fname_fastq:
      brcd_outfname = fname_fastq + '_starcode.txt'
   if spk_outfname == fname_fastq:
      spk_outfname = fname_fastq + '_spikes_starcode.txt'

   if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
      return (brcd_outfname, spk_outfname)

   GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
   SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)
   barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
   spike_tempf = tempfile.NamedTemporaryFile(delete=False)
   with gzopen(fname_fastq) as f:
      outf = None
      for lineno,line in enumerate(f):
         if lineno % 4 != 1: continue
         hit = GFP.match(line)
         if hit is not None:
            outf = barcode_tempf
         else:
            hit = SPIKE.match(line)
            if hit is not None:
               outf = spike_tempf
            else:
               continue
         pos = hit.matchlist[0][0]
         if MIN_BRCD <= pos <= MAX_BRCD:
            outf.write(line[:pos] + '\n')
   barcode_tempf.close()
   spike_tempf.close()

   # Skip if file exists.
   if not os.path.exists(brcd_outfname):
      # Call `starcode`.
      subprocess.call([
         'starcode',
         '-t4',
         '-i', barcode_tempf.name,
         '-o', brcd_outfname,
      ])

   if not os.path.exists(spk_outfname):
      subprocess.call([
         'starcode',
         '-t4',
         '-i', spike_tempf.name,
         '-o', spk_outfname,
      ])

   # Delete temporary files.
   os.unlink(barcode_tempf.name)
   os.unlink(spike_tempf.name)

   return (brcd_outfname, spk_outfname)
Example #30
def parse_asn(fn, assembly):
    def subsplit(lst, tok="="):
        sdict = dict()
        for elm in lst:
            elm = elm.strip()
            subfields = elm.split(tok)
            if len(subfields) == 1:
                sdict[elm] = True
            elif len(subfields) == 2:
                sdict[subfields[0]] = subfields[1]
        return sdict

    rs = ss = chrom = pos = ref = alt = None
    indel = False

    with gzopen(fn) as f:
        # skip header
        next(f)
        next(f)
        next(f)
        for line in f:
            line = line.strip()

            if line == "":
                # next entry
                if None not in (rs, ss, chrom, pos):
                    yield rs, ss, chrom, pos, ref, alt
                rs = ss = chrom = pos = ref = alt = None
                indel = False
                continue

            if indel:
                continue

            fields = line.split(" | ")

            if fields[0].startswith("rs"):
                subfields = subsplit(fields[1:])
                if "snp" in subfields:
                    rs = fields[0]
                else:
                    indel = True

            elif fields[0].startswith("ss"):
                if ss is None:
                    ss = [fields[0]]
                else:
                    ss.append(fields[0])

            elif fields[0] == "SNP":
                subfields = subsplit(fields[1:])
                alleles = subfields.get("alleles")
                if alleles is not None:
                    if len(alleles) == 5:
                        # biallelic string, e.g. 'A/G'
                        ref, alt = alleles[1], alleles[3]

            elif fields[0] == "CTG" and chrom is None:
                subfields = subsplit(fields[1:])
                if subfields.get("assembly") == assembly:
                    chrom = subfields.get("chr")
                    pos = int(subfields.get("chr-pos", -1))

    # last entry
    if None not in (rs, ss, chrom, pos):
        yield rs, ss, chrom, pos, ref, alt
Example #31
def parse_col0(fn):
    with gzopen(fn) as f:
        for line in f:
            yield line.split(None, 1)[0]
Example #32
    if args.methylkit:
        ichrom = 1
        ipos = 2
    elif args.pileOmeth:
        ichrom = 0
        ipos = 1
    else:
        raise Exception("Are we doing methylkit or pileOmeth?")

    chrmap = {s: i for i, s in enumerate(parse_col0(args.chr_list))}
    f1 = parse_tsv(args.in1, ichrom, ipos, chrmap)
    f2 = parse_tsv(args.in2, ichrom, ipos, chrmap)

    l1 = l2 = oline = None

    with gzopen("/dev/stdout", "w", args.gzip) as fout:
        try:
            k1, l1 = next(f1)
            k2, l2 = next(f2)
            while True:
                if k1 < k2:
                    oline = l1
                    l1 = None
                    k1, l1 = next(f1)
                elif k2 < k1:
                    oline = l2
                    l2 = None
                    k2, l2 = next(f2)
                else:
                    if args.methylkit:
                        oline = l1[:4]
Example #33
def binit(ref_limits, bin_size, mismatch, input_file, output_dir='.'):
   """
   select the reads that are mapped uniquely, allowing three mismatches
   bin into certain window size

   """

   try:
      # import dict containing chromosomes size
      # limits = {'chr1': 1898309, 'chr2': 2902930, ...}
      limits =__import__("limits") #limits.py is the file containing limits dict 
      limits = limits.__dict__.get(ref_limits) #choose which species, e.g hg19 , mm10
      freq_table = defaultdict(int)
   except:
      raise

   try:
      f = gzopen(input_file)
      for line in f:
         # Fields: read name, sequence, quality, map count, position(s).
         item = line.rstrip().split('\t')
         #mapping = item[4]
         if item[4] == '-': continue
         #check for mismatches
         # e.g. 0:0:1 = allowing 2 mismatches (2 zeros)
         no_mismatch = item[3].count("0")
         if no_mismatch <= mismatch:
            # 'mapping' is like "chr1:+:12942:34T1,chr15:-:102518193:34T1"
            getmap = item[4].split(":")
            chrom = getmap[0]
            start = getmap[2]
            freq_table[(chrom, int(start)/bin_size)] += 1
   except IndexError:
      sys.stderr.write("Check if your file is properly formatted, !field separator is a tab!")
   except:
      sys.stderr.write('file error: %s' % input_file)
      raise
   finally:
      f.close()

   #collect all the aligned chromosomes without duplicates
   chroms = set([chrom for (chrom,bin) in freq_table.keys()])

   #start binning ...

   # disable the garbage collector during list appends
   gc.disable()

   bin_list=[]
   for chrom in sorted(limits):
      chrom_size = int(limits[chrom])
      maxbin = int(chrom_size/bin_size)
      for bin in range(maxbin):
         bin_list.append("%s\t%s\t%s\t%d\n" % \
                        (chrom, 1+bin*bin_size,
                        (1+bin)*bin_size, freq_table[(chrom,bin)]))

   gc.enable()

   #finally output the file                                     
   if not os.path.exists(output_dir):
      try:
         os.makedirs(output_dir)
      except OSError as exception:
         raise

   head, tail = os.path.split(input_file)
   base = os.path.splitext(tail)[0]
   output_fname="%sbin-%s.bed" %(bin_size,base)
   output_file = str(os.path.join(output_dir, output_fname))

   with open(output_file, 'w') as output_f:
      for line in bin_list:
         output_f.write(line)
Example #34
# -*- coding:utf-8 -*-

# This script takes the 5'UTR regions of every annotated Drosophila gene in FlyBase and associates them with experimental modENCODE TSSs from 5' RACE experiments.

import re
import sys

from gzopen import gzopen

records = {}

# First we create the header of the file
sys.stdout.write('FBgnID\tchromosome\tstrand\tstartUTR\tendUTR\n')

# Open the Fasta file from Flybase with all the annotated 5'UTR's
with gzopen('dmel-all-five_prime_UTR-r5.52.fasta.gz') as f:
    for line in f:
        # We work with the Fasta header of each entry:
        # >FBtr0086024 type=five_prime_untranslated_region; loc=2R:complement(1946941..1947063);
        # name=CG7856-RA; MD5=0e29152561825b6636f4f7408d1ccfbb; length=123; parent=FBgn0033056; release=r5.52; species=Dmel;
        if line[0] != '>': continue
        (chrom, ori, start, end, parent) = re.search(
            r'loc=([^:]+):(join\(|complement\()?(\d+)\.[\d.,]*\.(\d+).*parent=(FBgn\d{7})',
            line).groups()
        strand = '-' if ori == 'complement(' else '+'
        # Keep only the shortest 5'UTR for a given start position.
        realstart = start if strand == '+' else end
        if records.has_key((parent, realstart)):
            record = records[(parent, realstart)]
            if int(end) - int(start) > int(record[4]) - int(record[3]):
                continue
Example #35
def vcf_parser(filename,
               yield_headers=False,
               yield_samples=True,
               yield_genotypes=True,
               parse_genotypes=True):
    """
    Parse vcf. Pyvcf does not meet my needs.  Yields specified vcf lines.

    Assumes the format is the same for every line.
    """

    samples = None

    with gzopen(filename) as f:

        # parse header
        for lineno, line in enumerate(f, 1):
            if yield_headers:
                yield line.rstrip("\r\n")
            if not line.startswith("#"):
                break
            if line.startswith("#CHROM"):
                # column header line
                # #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0 [sample1 ...]
                line = line.rstrip("\r\n")
                fields = line.split("\t")
                if len(fields) < 9:
                    raise ParseError(
                        "{}: line {}: expected at least 9 columns for vcf column header."
                        .format(filename, lineno))
                samples = fields[9:]
                break

        if yield_samples:
            if samples is None:
                raise ParseError(
                    "{}: no vcf column header found.".format(filename))
            yield samples

        if not yield_genotypes:
            return

        genotypes = collections.defaultdict(dict)
        fmt_fields = None

        # parse variants
        for line in f:
            lineno += 1
            line = line.rstrip("\r\n")
            fields = line.split("\t")

            chrom = fields[0]
            pos = int(fields[1])
            id = fields[2]
            ref = fields[3]
            alt = fields[4]
            qual = fields[5]
            filter = fields[6]
            info_str = fields[7]
            format_str = fields[8]
            genotype_str_list = fields[9:]

            if chrom.startswith("chr"):
                chrom = chrom[3:]

            if not parse_genotypes:
                yield (lineno, chrom, pos, id, ref, alt, qual, filter,
                       info_str, format_str, genotype_str_list)
                continue

            if fmt_fields is None:
                fmt_fields = format_str.split(":")

            for sample, genotype_str in zip(samples, genotype_str_list):
                gen_fields = genotype_str.split(":")
                for fmt, gen in zip(fmt_fields, gen_fields):
                    genotypes[sample][fmt] = gen

            yield (lineno, line, chrom, pos, id, ref, alt, qual, filter,
                   info_str, fmt_fields, genotypes)
Example #36

if __name__ == "__main__":
    args = parse_args()

    mk_files = [None, None, None]
    pm_files = [None, None, None]

    if args.gzip:
        suffix = ".gz"
    else:
        suffix = ""

    if args.methylkit:
        if args.cpg:
            mk_files[0] = gzopen(
                "{}.methylkit.CpG.txt{}".format(args.oprefix, suffix), "w")
        if args.chg:
            mk_files[1] = gzopen(
                "{}.methylkit.CHG.txt{}".format(args.oprefix, suffix), "w")
        if args.chh:
            mk_files[2] = gzopen(
                "{}.methylkit.CHH.txt{}".format(args.oprefix, suffix), "w")

        for f in mk_files:
            if f is not None:
                print("chrBase\tchr\tbase\tstrand\tcoverage\tfreqC\tfreqT",
                      file=f)

    if args.pileOmeth:
        if args.cpg:
            pm_files[0] = gzopen(
Example #37
           fn_re.match(f)]
# The reads from the qMiseq come separated in 4 lanes. Merge them.

for sample in samples:
    # Ex. PromoterA12_S12_L001_R1_001.fastq.gz
    fn_items = sample.split('_')
    outfname = fn_items[0] + '.fastq'
    if os.path.isfile(outfname):
        sys.stdout.write("Sample: %s already processed, skipping.\n"
                         % outfname.split('/')[-1])
        continue

    lanes = ['L001','L002','L003','L004']
    for lane in lanes:
        # Rebuild the file name for this lane.
        toextract = '_'.join([fn_items[0],fn_items[1],lane,
                              fn_items[3],fn_items[4]])
        # Write the first lane, then append the other three.
        mode = 'w' if lane == 'L001' else 'a'
        with gzopen(toextract) as f, open(outfname, mode) as g:
            for line in f:
                g.write(line)
Example #38
0
#!/usr/bin/python
# -*- coding:utf-8 -*-

import re
import sys
import tempfile
from gzopen import gzopen

BYTES = 65536

s = int(sys.argv[1])
pad = ''.join(['#']*36)

with tempfile.TemporaryFile() as temp:
   # Make a temp fasta file without newline on the sequence.
   with gzopen(sys.argv[2]) as f:
      txt = f.read(BYTES)
      while txt != '':
         while '>' in txt:
            txt += f.read(BYTES)
            header = re.search(r'\n?>[^\n]+\n', txt)
            temp.write(txt[:header.start()].replace('\n', ''))
            temp.write(txt[header.start():header.end()])
            txt = txt[header.end():]
         temp.write(txt.replace('\n', ''))
         txt = f.read(BYTES)

   # Reset temp file and read line by line.
   temp.seek(0)
   for line in temp:
      if line[0] == '>':
Example #39
0
import fileinput
import os
import pdb
import re
import seeq
import sys
import subprocess
import tempfile
from collections import defaultdict
from gzopen import gzopen
from itertools import izip

TOMAPfname = sys.argv[1] + '_2map'
#pdb.set_trace()
with gzopen(sys.argv[1]) as f, open(TOMAPfname, 'w') as g:
    for lineno, line in enumerate(f):
        # This is a fastq file: keep only the sequence lines.
        if lineno % 4 != 1: continue
        # Exact search for the NlaIII site (CATG).
        brcd = line.rstrip().split('CATG')[0]
        if len(brcd) == len(line.rstrip()): continue
        seq = line.rstrip().split('CATG')[1]
        # Cut at an MluCI site (AATT), if any.
        dna = seq.split('AATT')[0]
        # Write a fasta record so the fragment can be mapped.
        if not 10 < len(brcd) < 22: continue
        if len(dna) <= 5: continue
        g.write('>%s\n%s\n' % (brcd, dna))

# Map the sequences
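# (The original mapping command is cut off here. A minimal sketch of how the
# step could be invoked with the 'subprocess' module imported above, assuming
# a pre-built bwa index; 'bwa mem', the index path and the output name are
# assumptions, not the original call.)
SAMfname = sys.argv[1] + '_mapped.sam'
with open(SAMfname, 'w') as sam:
    subprocess.check_call(['bwa', 'mem', 'genome_index.fa', TOMAPfname],
                          stdout=sam)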
Example #40
0
    cst = seeq.compile(r'CGCACTAATGAATTCGTTGCA', 4)
    GATCGATC = seeq.compile(r'GATCGATC', 1)

    for line in f:
        # First remove the constant part, keep the left part
        # with oligo-specific nucleotides plus GATCGATC, and
        # keep the UMI on the right.
        try:
            oligo, ignore, umi = cst.match(line.rstrip()).tokenize()
            # Target length is 32. Allow at most 2 indels.
            if not 30 <= len(oligo) <= 34: continue
        except (ValueError, AttributeError):
            continue

        # Then split the oligo part to extract GATCGATC
        try:
            start, end, ignore = GATCGATC.match(oligo[10:22]).matchlist[0]
        except AttributeError:
            continue
        brcd = oligo[:10 + start] + oligo[10 + end:]
        readout = oligo[10 + start:10 + end]

        # Output fingerprint and GATCGATC
        fingerprint = brcd + 'AGATACAGAGATAATACA' + umi
        sys.stdout.write('%s\t%s\n' % (fingerprint, readout))


if __name__ == '__main__':
    with gzopen(sys.argv[1]) as f:
        extract_fingerprint_and_GATCGATC(f)
def Rvtests_bin(Sumstat, fasta):
    with gzopen(Sumstat) as stats:  # gzopen opens plain or gzipped files on the fly
        OUT = open(str(Sumstat.split("/")[-1] + ".Tidy"), "w")  # output file
        # Load the fasta file with pysam; this only needs to be done once
        # because it writes the index.
        genome = pysam.Fastafile(fasta)
        count = 0  # number of variants
        count_mm = 0  # mismatches
        count_m = 0  # matches
        for line in stats:
            # rvtests writes metadata lines at the start, and the genomic
            # lambda may be on the last line; skip both.
            if line.startswith("##") or line.startswith("#G"):
                continue
            # The column header identifies which tool produced the summary
            # statistics and therefore how the columns must be transformed.
            elif line.startswith("CHROM") or line.startswith("#CHROM"):
                check_tool(line)
                li_pos = line.strip().split("\t")
                li_pos.append("MAF_PH")
                li_pos.append("MAC_PH")
                li_pos.append("EAC_PH")
                li_pos.append("EAF_PH")
                li_pos[BETA] = "OR"
                li_pos.append("\n")
                OUT.write("\t".join(li_pos))  # write out
            else:
                li_pos = line.strip("\n").split("\t")
                bp_pos = li_pos[1]  #base position
                chr_n = li_pos[0]  #chr number
                stat_ref = li_pos[2]  # reference or non effect allele
                if ((chr_n.isdigit() and int(chr_n) == 23)
                        or chr_n == "X") and monomorphic_filter(li_pos) is False:
                    # For the X chromosome: rename chr23 and query the
                    # reference genome with pysam.
                    chr_n = "X"
                    gen_ref = genome.fetch(chr_n, int(bp_pos) - 1, int(bp_pos))
                    if stat_ref != gen_ref:  # output statistics for switched Ref/Alt
                        count_mm = count_mm + 1  # update mismatch counter
                        li_pos[2] = gen_ref  # swap references
                        li_pos[3] = stat_ref  # swap alternative
                        li_pos[BETA] = str(Beta2ORinvert(li_pos, md5))  # update BETA to OR
                        li_pos.append(MAF(li_pos))  # update MAF
                        li_pos.append(MAC(li_pos))  # update MAC
                        # Swap genotype counts when misaligned; important to
                        # do this before computing EAC.
                        li_pos[N_ref], li_pos[N_alt] = li_pos[N_alt], li_pos[N_ref]
                        li_pos.append(EAC(li_pos))  # update EAC
                        li_pos.append(str(1 - float(li_pos[AF])))  # update EAF
                        li_pos.append("\n")  # required line terminator
                        OUT.write("\t".join(li_pos))  # write to new sumstat file
                    elif stat_ref == gen_ref:  # update statistics where Ref/Alt are not switched
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos.append(EAC(li_pos))
                        li_pos.append(str(float(li_pos[AF])))
                        li_pos[BETA] = str(Beta2OR(li_pos, md5))  # BETA -> OR
                        count_m = count_m + 1  # update matches
                        li_pos.append("\n")
                        OUT.write("\t".join(li_pos))
                    else:
                        continue
                elif chr_n.isdigit() and int(chr_n) > 23:  # non-autosomal & remaining sex chromosomes
                    continue
                elif (chr_n.isdigit() and int(chr_n) < 23
                      and monomorphic_filter(li_pos) is False):
                    # Main computational branch: all autosomes.
                    gen_ref = genome.fetch(chr_n, int(bp_pos) - 1,
                                           int(bp_pos))  # query the reference genome
                    if stat_ref != gen_ref:
                        count_mm = count_mm + 1
                        li_pos[2] = gen_ref  # swap references
                        li_pos[3] = stat_ref  # swap alternatives
                        li_pos[BETA] = str(Beta2ORinvert(li_pos, md5))
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos[N_ref], li_pos[N_alt] = li_pos[N_alt], li_pos[N_ref]
                        li_pos.append(EAC(li_pos))
                        li_pos.append(str(1 - float(li_pos[AF])))
                        li_pos.append("\n")
                        OUT.write("\t".join(li_pos))
                    elif stat_ref == gen_ref:
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos.append(EAC(li_pos))
                        li_pos.append((str(float(li_pos[AF]))))
                        li_pos[BETA] = str(Beta2OR(li_pos, md5))
                        count_m = count_m + 1
                        li_pos.append("\n")
                        OUT.write("\t".join(li_pos))
                    else:
                        continue
                else:
                    continue
                count = count + 1
        OUT.close()
        return (str(count_mm) + " Mismatches: Reference allele is effect allele\n"
                + str(count_m) + " Matches to reference\n"
                + str(count) + " Sites checked")
Example #42
0
def translate(DNA):
    n = len(DNA)
    return ''.join([gcode.get(DNA[i:i + 3], '_') for i in range(0, n, 3)])
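
# Note: 'gcode' is defined elsewhere in the original script. It maps codons to
# one-letter amino acids, e.g. {'ATG': 'M', 'TGG': 'W', 'TTT': 'F', 'TAA': '*', ...},
# so translate('ATGTGGTTT') returns 'MWF', and any codon absent from the table
# (or a trailing partial codon) becomes '_'.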


def main(f0, f1, f14):
    # Create one int dictionary per file.
    dict0 = defaultdict(int)
    dict1 = defaultdict(int)
    dict14 = defaultdict(int)

    for line in f0:
        DNA, count = line.split()
        dict0[DNA] = int(count)
    for line in f1:
        DNA, count = line.split()
        dict1[DNA] = int(count)
    for line in f14:
        DNA, count = line.split()
        dict14[DNA] = int(count)

    for DNA in set(dict0).union(dict1).union(dict14):
        print DNA, translate(DNA), dict0[DNA], dict1[DNA], dict14[DNA]


if __name__ == "__main__":
    with gzopen(sys.argv[1]) as f0, gzopen(sys.argv[2]) as f1, \
             gzopen(sys.argv[3]) as f14:
        main(f0, f1, f14)
def Rvtests_Quan(Sumstat, fasta):
    with gzopen(Sumstat) as stats:
        OUT = open(str(Sumstat.split("/")[-1] + ".Tidy"), "w")
        # Load the fasta file with pysam; this only needs to be done once
        # because it writes the index.
        genome = pysam.Fastafile(fasta)
        count = 0  # number of variants
        count_mm = 0  # mismatches
        count_m = 0  # matches
        for line in stats:
            # rvtests writes metadata lines at the start; skip them.
            if line.startswith("##") or line.startswith("#G"):
                continue
            # The column header identifies which tool produced the summary
            # statistics and therefore how the columns must be transformed.
            elif line.startswith("CHROM") or line.startswith("#CHROM"):
                print check_tool(line)
                li_pos = line.strip("\n").split("\t")
                li_pos.append("MAF_PH")
                li_pos.append("MAC_PH")
                li_pos.append("EAC_PH")
                li_pos.append("EAF_PH")
                li_pos[BETA] = "BETA"
                li_pos.append("\n")
                OUT.write("\t".join(li_pos))
            else:
                li_pos = line.strip("\n").split("\t")
                bp_pos = li_pos[1]  #base position
                chr_n = li_pos[0]  #chr number
                stat_ref = li_pos[2]  # reference or non effect allele
                # For the X chromosome. Genotype counts are currently used to
                # limit rows: if a variant is homozygous for a single allele,
                # the site is theoretically non-informative. This could be
                # switched to 1 / (2 * number of samples), which gives the
                # theoretical minimum MAF observable given the data.
                if ((chr_n.isdigit() and int(chr_n) == 23)
                        or chr_n == "X") and monomorphic_filter(li_pos) is False:
                    chr_n = "X"  # rename chr23
                    gen_ref = genome.fetch(chr_n, int(bp_pos) - 1,
                                           int(bp_pos))  # query the reference genome with pysam
                    if stat_ref != gen_ref:  # output statistics for switched Ref/Alt
                        count_mm = count_mm + 1
                        li_pos[2] = gen_ref  # swap references
                        li_pos[3] = stat_ref  # swap alternative
                        # str() is needed here so that "\t".join() works below.
                        li_pos[BETA] = str(float(li_pos[BETA]) * -1)  # flip the sign of BETA
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos[N_alt], li_pos[N_ref] = li_pos[N_ref], li_pos[N_alt]
                        li_pos.append(EAC(li_pos))  # update EAC
                        li_pos.append(str(1 - float(li_pos[AF])))  # update EAF
                        li_pos.append("\n")  # required line terminator
                        OUT.write("\t".join(li_pos))  # write to new sumstat file
                    elif stat_ref == gen_ref:  # update statistics where Ref/Alt are not switched
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos.append(EAC(li_pos))
                        li_pos.append(str(float(li_pos[AF])))
                        li_pos[BETA] = str(float(li_pos[BETA]))  # BETA kept (quantitative trait)
                        count_m = count_m + 1
                        li_pos.append("\n")
                        OUT.write("\t".join(li_pos))
                    else:
                        continue
                elif chr_n.isdigit() and int(chr_n) > 23:  # non-autosomal & remaining sex chromosomes
                    continue
                elif (chr_n.isdigit() and int(chr_n) < 23
                      and monomorphic_filter(li_pos) is False):
                    # Main computational branch: all autosomes.
                    gen_ref = genome.fetch(chr_n, int(bp_pos) - 1,
                                           int(bp_pos))  # query the reference genome
                    if stat_ref != gen_ref:
                        count_mm = count_mm + 1
                        li_pos[2] = gen_ref  # swap references
                        li_pos[3] = stat_ref  # swap alternatives
                        li_pos[BETA] = str(float(li_pos[BETA]) * -1)  # flip the sign of BETA
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos[N_alt], li_pos[N_ref] = li_pos[N_ref], li_pos[N_alt]
                        li_pos.append(EAC(li_pos))
                        li_pos.append(str(1 - float(li_pos[AF])))
                        li_pos.append("\n")
                        OUT.write("\t".join(li_pos))
                    elif stat_ref == gen_ref:
                        li_pos.append(MAF(li_pos))
                        li_pos.append(MAC(li_pos))
                        li_pos.append(EAC(li_pos))
                        li_pos.append(str(float(li_pos[AF])))
                        li_pos[BETA] = str(float(li_pos[BETA]))  # BETA kept (quantitative trait)
                        count_m = count_m + 1
                        li_pos.append("\n")
                        OUT.write("\t".join(li_pos))
                    else:
                        continue
                else:
                    continue
                count = count + 1
        OUT.close()
        return (str(count_mm)
                + " Reference mismatches: Reference allele is effect allele\n"
                + str(count_m) + " Matches to reference genome\n"
                + str(count) + " Sites checked")
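
# Both Rvtests_bin and Rvtests_Quan repeat the same re-orientation step when
# the summary-statistic reference allele disagrees with the reference genome.
# A minimal standalone sketch of that step for the quantitative case, assuming
# the genome base matches the study's alternate allele (hypothetical helper,
# not one of the original MAF/MAC/EAC functions):
def flip_to_genome_reference(stat_ref, stat_alt, beta, eaf):
    # Swap the alleles, flip the sign of the effect size and complement the
    # effect allele frequency so the genome base becomes the non-effect allele.
    return stat_alt, stat_ref, -beta, 1.0 - eaf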