Example #1
0
# Formatted output lines, keyed by (seqname, start) so that they can
# be written out sorted by sequence name, then start position.
genes = {}

for header in headers:
   # 'loc' yields four groups: seqname, strand marker, start, end.
   seq, strand, start, end = re.search(loc, header).groups()
   # A strand group that did not match (None) means '+', else '-'.
   strand = '-' if strand is not None else '+'
   # 'ID' yields exactly one group, used as the gene identifier.
   [geneID] = re.search(ID, header).groups()
   # 'start' stays a string in the output line; it is converted to
   # an integer in the key only, so that sorting is numeric.
   fields = [geneID, seq, start, end, strand]
   genes[(seq, int(start))] = '\t'.join(fields)

# Column names of the table, in the same order as 'fields' above.
table_header = '\t'.join(
      ['geneID', 'seqname', 'start', 'end', 'strand']
   ) + '\n'

# The vheader comes first, then the column names.
sys.stdout.write(vheader(*sys.argv))
sys.stdout.write(table_header)
# One line per gene, ordered by (seqname, start).
for position in sorted(genes):
   sys.stdout.write(genes[position] + '\n')
Example #2
0
def JSONtargets(mappingfile, bindingfile):
   """Write a JSON gene target set built from a gene mapping file
   and a discrete binding profile.

   Every gene is paired with one binding element, chosen by
   'get_closest' (defined elsewhere) from the gene TSS and the
   element coordinates. Genes farther than MAXDIST from that element
   are listed under 'NA'. All other genes are listed under 'total'
   and under every feature for which the element is scored '1'.
   The version tracking header and the JSON document are written to
   stdout; nothing is returned.

   Arguments:
      mappingfile: tab-separated file with columns geneID, seqname,
         start, end, strand ('+' or '-'). A header line is dropped
         if it has 'start' and 'end' in the third and fourth columns.
      bindingfile: tab-separated file whose first line is a header
         naming the features from the fifth column on, and whose
         other lines hold element ID, seqname, start, end, followed
         by one '0'/'1' value per feature.
   """
   # Read in gene mapping. Skip comment lines and remove stray
   # 'chr' sometimes present in chromosome names.
   # NOTE: the replacement is applied to the whole line, so 'chr'
   # is removed from every column, not only from the seqname.
   with open(mappingfile, 'r') as f:
      mapping = [
            line.rstrip().replace('chr','').split('\t')
            for line in vskip(f)
            if line[0] != '#'
         ]

   # Remove the header if present (recognized by 'start' and
   # 'end' in third and fourth columns). The emptiness check keeps
   # a mapping file without data lines from raising IndexError.
   if mapping and mapping[0][2:4] == ['start','end']: mapping.pop(0)

   # Collect TSS, if gene is on +, TSS is on start, else on end.
   # The dispatch table is built once, outside the loop. A strand
   # other than '+' or '-' still fails with TypeError as before.
   TSS_of = {
     '+': lambda x: (x[1], int(x[2])), # 2nd and 3rd column.
     '-': lambda x: (x[1], int(x[3]))  # 2nd and 4th column.
   }
   # Arrange TSS by geneID in a dictionary.
   # Example: TSS['FBgn0031208'] = ('2L', 7529)
   TSS = {}
   for row in mapping:
      TSS[row[0]] = TSS_of.get(row[4])(row)


   # Read in binding data. Skip comment lines and remove
   # 'chr' on chromosome names.
   with open(bindingfile, 'r') as f:
      binding = [
            line.rstrip().replace('chr','').split('\t')
            for line in vskip(f)
            if line[0] != '#'
         ]
   # Get feature names and remove (pop) the header.
   # Example: features = ['D005', 'D007', ...]
   features = binding.pop(0)[4:]
   # "total" and "NA" are mutually exclusive lists of genes.
   targets = {'total': [], 'NA': []}
   for feature in features:
      targets[feature] = []


   # Collect mapping information (seqname, start, end) and
   # binding info (0/1).
   mapinfo = {}
   bindinfo = {}
   for row in binding:
      # Example: mapinfo['r5GATC2L00037'] = ('2L', 5301, 6026)
      mapinfo[row[0]] = (row[1], int(row[2]), int(row[3]))
      # Example: bindinfo['r5GATC2L00037'] = ['0','0','1',...]
      bindinfo[row[0]] = row[4:]


   # Get the closest feature to TSS.
   close_elt = get_closest(TSS, mapinfo, dist = dist)


   for geneID in close_elt:
      element = close_elt[geneID]
      if dist(TSS[geneID], mapinfo[element]) > MAXDIST:
         # The gene is too far. Push it to NA.
         targets['NA'].append(geneID)
         continue
      targets['total'].append(geneID)
      # The gene gets the status of the binding element closest
      # to its TSS.
      # Example pairs: [('D005', '0'), ('D007', '0'), ...]
      for (feature, yes) in zip(features, bindinfo[element]):
         if yes == '1':
            targets[feature].append(geneID)


   # Print the version tracking header and the JSON data.
   sys.stdout.write(vheader(*sys.argv))
   json.dump(targets, sys.stdout, indent=4)
Example #3
0
def collect_integrations(fname_starcode_out, fname_mapped, *args):
   """Write the table of barcode integration sites for a mapped file.

   Barcodes read from 'fname_mapped' are replaced by their canonical
   barcode as given in the starcode output. For every canonical
   barcode the mapping positions are tallied, unmapped reads
   included, and the most frequent position is retained unless the
   well supported positions disagree. Retained barcodes are written
   to '<fname_mapped with .map replaced>_insertions.txt' together
   with their read counts in the files passed in 'args', followed by
   the spike records. A summary line is appended to LOGFNAME.

   Arguments:
      fname_starcode_out: starcode output; the first column is the
         canonical barcode, the third a comma-separated list of the
         barcodes it stands for.
      fname_mapped: mapping output; the first column is the barcode,
         the second the read sequence, the fourth is '-' for an
         unmapped read or 'seqname:strand:position' otherwise.
      args: pairs (barcode count file, spike count file), both
         tab-separated with an identifier and a count per line.

   Returns None. Nothing is done if the output file already exists.
   Raises FormatException if a barcode count file is malformed.
   """

   # Sequence names that may appear in the table. Barcodes whose
   # integration lies elsewhere are counted as unmapped.
   KEEP = frozenset([
      '2L', '2LHet', '2R', '2RHet', '3L', '3LHet',
      '3R', '3RHet', '4', 'X', 'XHet', 'U', 'Uextra',
      'dmel_mitochondrion_genome', 'pT2',
   ])

   # Derive the output name. If the input name contains no '.map',
   # append the suffix instead so that the input is not overwritten.
   fname_insertions_table = re.sub(r'\.map', '_insertions.txt',
         fname_mapped)
   if fname_insertions_table == fname_mapped:
      fname_insertions_table = fname_mapped + '_insertions.txt'

   # Nothing to do if the table was produced by a previous run.
   if os.path.exists(fname_insertions_table):
      return

   def spread(positions):
      # Distance between the lowest and highest positions, infinite
      # if there is none or if they lie on different sequences.
      if not positions:
         return float('inf')
      ordered = sorted(positions)
      lowest, highest = ordered[0], ordered[-1]
      if lowest[0] != highest[0]:
         return float('inf')
      return highest[1] - lowest[1]

   # Map every barcode to the canonical barcode of its cluster.
   canonical = {}
   with open(fname_starcode_out) as f:
      for line in f:
         fields = line.split()
         centroid = fields[0]
         for member in fields[2].split(','):
            canonical[member] = centroid

   # counts[canonical barcode][position] = number of reads.
   counts = defaultdict(lambda: defaultdict(int))
   with open(fname_mapped) as f:
      for line in f:
         fields = line.split()
         # Reads whose barcode is unknown to starcode are ignored.
         if fields[0] not in canonical:
            continue
         barcode = canonical[fields[0]]
         hit = fields[3]
         if hit == '-':
            # Unmapped read: a 2-tuple, recognized by its length
            # when the table is written.
            position = ('', 0)
         else:
            parts = hit.split(':')
            start = int(parts[2])
            # On any strand but '+', shift by the read length.
            if parts[1] != '+':
               start += len(fields[1])
            position = (parts[0], start, parts[1])
         counts[barcode][position] += 1

   # Keep one integration per barcode: (position, total reads).
   integrations = {}
   for brcd, hist in counts.items():
      total = sum(hist.values())
      cutoff = max(1, 0.1*total)
      top = [pos for pos in hist if hist[pos] > cutoff]
      # Skip barcode in case of disagreement between top votes.
      if spread(top) > 10:
         continue
      integrations[brcd] = (max(hist, key=hist.get), total)

   # Read counts of the barcodes in the other files. Only the first
   # item of each pair is used here, the second is the spike file.
   reads = {}
   for fname, _ in args:
      tally = defaultdict(int)
      reads[fname] = tally
      with open(fname) as f:
         for line in f:
            fields = line.split('\t')
            try:
               tally[fields[0]] = int(fields[1])
            except (IndexError, ValueError):
               raise FormatException("Input file with wrong format")

   with open(fname_insertions_table, 'w') as outf:
      outf.write(vheader(*sys.argv))
      mapped = unmapped = 0
      # Rows are sorted by integration (position, total), then barcode.
      ordered = sorted(integrations, key=lambda b: (integrations[b], b))
      for brcd in ordered:
         location, total = integrations[brcd]
         # Unmapped positions have two items instead of three.
         if len(location) != 3 or location[0] not in KEEP:
            unmapped += 1
            continue
         mapped += 1
         chrom, pos, strand = location
         row = [brcd, chrom, strand, '%d' % pos, '%d' % total]
         row.extend(str(reads[fname][brcd]) for fname, _ in args)
         outf.write('\t'.join(row) + '\n')

      # Now add the spikes if the experiment was spiked. Each spike
      # has a count in the column of its own file and 0 elsewhere.
      N = len(args)
      for i, (_, fname) in enumerate(args):
         with open(fname) as f:
            for line in f:
               fields = line.rstrip().split('\t')
               # Lines without a count are silently skipped.
               if len(fields) < 2:
                  continue
               columns = ['0'] * N
               columns[i] = fields[1]
               outf.write(
                  ('%s\tspike\t*\t0\t0\t' % fields[0])
                  + '\t'.join(columns) + '\n'
               )

   with open(LOGFNAME, 'a') as f:
      f.write('%s: mapped:%d, unmapped:%d\n'
            % (fname_mapped, mapped, unmapped))
   return
Example #4
0
def collect_integrations(fname_starcode_out, fname_mapped, *args):
    """Write the table of barcode integration sites for a SAM file.

    Barcodes read from 'fname_mapped' are replaced by their canonical
    barcode as given in the starcode output. For every canonical
    barcode the mapped positions are tallied and the most frequent
    one is retained, unless the well supported positions disagree.
    Retained barcodes are written, one per line, to
    '<fname_mapped with .sam replaced>_insertions.txt' as
    barcode, seqname, strand, position, total reads, followed by one
    read count per pair in 'args'. The spike records come last.
    A summary line is appended to LOGFNAME.

    Arguments:
        fname_starcode_out: starcode output; the first column is the
            canonical barcode, the third a comma-separated list of
            the barcodes it stands for.
        fname_mapped: SAM file whose read names are the barcodes.
        args: pairs (barcode count file, spike count file), both
            tab-separated with an identifier and a count per line.

    Returns None. Nothing is done if the output file already exists.
    Raises FormatException if a barcode count file is malformed.
    """

    fname_insertions_table = re.sub(r'\.sam', '_insertions.txt', fname_mapped)
    # Substitution failed, append '_insertions.txt' to avoid name conflict.
    if fname_insertions_table == fname_mapped:
        fname_insertions_table = fname_mapped + '_insertions.txt'

    # Skip if file exists.
    if os.path.exists(fname_insertions_table):
        return

    def dist(intlist):
        # Distance between the lowest and highest positions, infinite
        # if the list is empty or spans different sequences.
        intlist.sort()
        try:
            if intlist[0][0] != intlist[-1][0]:
                return float('inf')
            return intlist[-1][1] - intlist[0][1]
        except IndexError:
            return float('inf')

    # Map every barcode to the canonical barcode of its cluster.
    canonical = dict()
    with open(fname_starcode_out) as f:
        for line in f:
            items = line.split()
            for brcd in items[2].split(','):
                canonical[brcd] = items[0]

    # counts[canonical barcode][position] = number of mapped reads.
    counts = defaultdict(lambda: defaultdict(int))
    ISREV = 0b10000  # Bit of the SAM flag tested for the '-' strand.
    with open(fname_mapped) as f:
        for line in f:
            # Skip the SAM header.
            if line[0] == '@':
                continue
            items = line.split()
            try:
                barcode = canonical[items[0]]
            except KeyError:
                continue
            if items[2] == '*':
                # Reads without a reference sequence are not tallied.
                # NOTE(review): as a consequence 'unmapped' is always
                # 0 in the log line -- confirm this is intended.
                continue
            # GTTACATCGGTTAATAGATA 16  2L  9743332 60  9S32M [...]
            strand = '-' if int(items[1]) & ISREV else '+'
            chrom = items[2]
            pos = int(items[3])
            counts[barcode][(chrom, pos, strand)] += 1

    # Keep one integration per barcode: (position, total reads).
    integrations = dict()
    for brcd, hist in counts.items():
        total = sum(hist.values())
        top = [
            loc for loc, count in hist.items() if count > max(1, 0.1 * total)
        ]
        # Skip barcode in case of disagreement between top votes.
        if dist(top) > 10:
            continue
        ins = max(hist, key=hist.get)
        integrations[brcd] = (ins, total)

    # Count reads from other files.
    reads = dict()
    # First item of tuple is barcode file, second is the spike's one
    for (fname, ignore) in args:
        reads[fname] = defaultdict(int)
        with open(fname) as f:
            for line in f:
                items = line.split('\t')
                try:
                    reads[fname][items[0]] = int(items[1])
                except (IndexError, ValueError):
                    raise FormatException("Input file with wrong format")

    with open(fname_insertions_table, 'w') as outf:
        outf.write(vheader(*sys.argv))
        unmapped = 0
        mapped = 0
        for brcd in sorted(integrations,
                           key=lambda x: (integrations.get(x), x)):
            (chrom, pos, strand), total = integrations[brcd]
            mapped += 1
            outf.write('%s\t%s\t%s\t%d\t%d' %
                       (brcd, chrom, strand, pos, total))
            # The read counts and the line terminator belong to the
            # row of this barcode, so they are written inside the loop.
            for fname, ignore in args:
                outf.write('\t' + str(reads[fname][brcd]))
            outf.write('\n')

        # Now add the spikes if the experiment was spiked, otherwise continue.
        # Each spike has a count in the column of its own file, 0 elsewhere.
        N = len(args)
        for i in range(N):
            (ignore, fname) = args[i]
            with open(fname) as f:
                for line in f:
                    try:
                        items = line.rstrip().split('\t')
                        array = ['0'] * N
                        array[i] = items[1]
                        outf.write('%s\tspike\t*\t0\t0\t' % items[0])
                        outf.write('\t'.join(array) + '\n')
                    except IndexError:
                        # Lines without a count are skipped.
                        continue

    with open(LOGFNAME, 'a') as f:
        f.write('%s: mapped:%d, unmapped:%d\n' %
                (fname_mapped, mapped, unmapped))
    return