Example #1
class TestVariablePairs(unittest.TestCase):
  """Verify that MINEResults correctly reports missing or extraneous variables."""

  def setUp(self):
    self.fp = StringIO(SAMPLE_MIC)
    self.err = StringIO()

  def spin(self, var_list):
    """Create a MINEResults object, read all lines, return err buffer. 

    Returns:
      str: of resulting err buffer contents
    """
    m = top.MINEResults(self.fp, var_list=var_list, err=self.err)
    for row in m.get_rows(): pass
    return self.err.getvalue()

  def test_all_varpairs_match(self):
    """Verify no duplicate pairs of variables."""
    # All variables present.
    msg = self.spin(var_list=VARS[:])
    self.assertEqual(msg, "")

  def test_extra_variable_in_list(self):
    msg = self.spin(VARS[:] + ['abc'])
    self.assertEqual(msg, EXTRA_EXPECTED)

  def test_extra_mine_variable_not_in_list(self):
    msg = self.spin(VARS[:-1])
    self.assertEqual(len(VARS[:-1]), 3)
    self.assertEqual(msg, EXTRA_VAR_WARNINGS)

  def test_extra_mine_variable_not_in_list_extra(self):
    m = top.MINEResults(self.fp, var_list=VARS[:-1], err=self.err)
    for row in m.get_rows(): pass
    msg = self.err.getvalue()
    # verify all vars in varpairs
    for key in VARS[:-1]:
      self.assertTrue(key in m.var_pairs.vars)
    self.assertEqual(m.var_pairs.n_set, sum(m.var_pairs.V))
    self.assertEqual(m.var_pairs.n, 3)
    # The diagonal is not stored, so the number of unique pairs is
    #   n choose 2 (3 for n=3), not n**2.
    self.assertEqual(m.var_pairs.n_max_pairs, 3)
    self.assertEqual(len(VARS[:-1]), 3)
    self.assertEqual(msg, EXTRA_VAR_WARNINGS)

  def test_all_var_but_missing_pair(self):
    # Skip first two lines.
    self.fp.next()
    self.fp.next()
    msg = self.spin(VARS[:])
    self.assertEqual(msg, MISSING_PAIR_WARNING)
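
A note on portability: these snippets target Python 2, where StringIO objects (from the StringIO/cStringIO modules) expose a .next() method. On Python 3 the method is __next__, the class lives in io, and the idiomatic spelling is the next() builtin. A minimal sketch of the equivalent pattern (the buffer contents are illustrative):

import io

buf = io.StringIO("header\nrow 1\nrow 2\n")
header = next(buf)              # Python 3 spelling of buf.next()
rows = [line.rstrip() for line in buf]
print(header.rstrip())          # header
print(rows)                     # ['row 1', 'row 2']
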
Example #2
class FlickrFile(object):

    """A file-like object representing a file on flickr. Caches with a StringIO object"""

    def __init__(self, imageid, name, data):
        self.imageid = imageid
        self.name = name
        self.stringio = StringIO(data)
        self.closed = False
        self.newlines = ('\r', '\n', '\r\n')
        self.flush()

    def close(self):
        self.flush()
        self.closed = True

    def _stringio_get_data(self):
        old_seek = self.stringio.tell()
        self.stringio.seek(0)
        data = self.stringio.read()
        self.stringio.seek(old_seek)
        return data

    def flush(self):
        # Snapshot the buffer once; it is reused for the upload and its size.
        data = self._stringio_get_data()
        with tempfile.NamedTemporaryFile() as tf:
            data_to_png(data).save(tf, 'png')
            if self.imageid:
                flickr.replace(filename=tf.name, photo_id=self.imageid, title=self.name, description=str(len(data)), format='bs4')
            else:
                self.imageid = flickr.upload(filename=tf.name, title=self.name, description=str(len(data)), format='bs4').photoid.text

    def __iter__(self):
        return self

    def next(self):
        return self.stringio.next()

    def read(self, size=-1):
        return self.stringio.read(size)

    def readline(self, size=-1):
        return self.stringio.readline(size)

    def seek(self, offset, whence=0):
        return self.stringio.seek(offset, whence)

    def tell(self):
        return self.stringio.tell()

    def truncate(self, size=0):
        return self.stringio.truncate(size)

    def write(self, data):
        return self.stringio.write(data)

    def writelines(self, seq):
        return self.stringio.writelines(seq)
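
FlickrFile delegates the file protocol to an internal StringIO and re-uploads the buffer in flush(). Below is a minimal, self-contained sketch of the same delegation pattern with the Flickr-specific upload replaced by a no-op hook; the class and method names are illustrative, and io.StringIO is used so it runs on Python 3:

import io

class BufferedRemoteFile(object):
    """File-like wrapper delegating to an in-memory buffer (illustrative)."""

    def __init__(self, name, data=""):
        self.name = name
        self._buf = io.StringIO(data)
        self.closed = False

    def _sync(self):
        # Placeholder for the remote write (the flickr upload in the snippet above).
        pass

    def flush(self):
        self._sync()

    def close(self):
        self.flush()
        self.closed = True

    # Delegate the rest of the file protocol to the internal buffer.
    def read(self, size=-1):
        return self._buf.read(size)

    def readline(self, size=-1):
        return self._buf.readline(size)

    def write(self, data):
        return self._buf.write(data)

    def seek(self, offset, whence=0):
        return self._buf.seek(offset, whence)

    def tell(self):
        return self._buf.tell()

    def __iter__(self):
        return iter(self._buf)

f = BufferedRemoteFile("notes.txt", "line 1\nline 2\n")
for line in f:
    print(line.rstrip())        # line 1, then line 2
f.close()
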
Example #3
 def testSample(self):
     mockInput = MagicMock(return_value=StringIO(self.sampleInput))
     output = StringIO()
     mockOutput = MagicMock(return_value=output)
     with patch("__builtin__.open", mockInput):
         inputFile = FileRecordStream("input_path")
         with patch("__builtin__.open", mockOutput):
             outputFile = FileRecordStream("output_path",
                                           fields=inputFile.getFields(),
                                           write=True)
             anomalyzer.sample(inputFile, outputFile, 1)
     result = StringIO(output.getvalue())
     result.next()
     result.next()
     result.next()
     reader = csv.reader(result)
     _, value = reader.next()
     self.assertIn(int(value), (1, 2, 3, 4, 5, 6))
     self.assertRaises(StopIteration, result.next)
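
The test above patches the builtin open so that FileRecordStream reads from and writes to in-memory buffers instead of real files. A minimal, self-contained sketch of that patching technique on Python 3, where the builtin lives in builtins rather than __builtin__ (file name and contents are made up):

import io
from unittest.mock import MagicMock, patch

sample = "a,b\n1,2\n3,4\n"
mock_open = MagicMock(return_value=io.StringIO(sample))

with patch("builtins.open", mock_open):
    with open("input_path") as fh:          # actually the StringIO above
        header = next(fh).strip()
        rows = [line.strip().split(",") for line in fh]

print(header)   # a,b
print(rows)     # [['1', '2'], ['3', '4']]
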
Example #5
def get_discrete_data(sites):
    '''
    Gets the time series of discrete discharge field measurements.

    Arguments:
    sites: site id to be queried

    Returns: pandas DataFrame of Time, Timezone, and Discharge
    '''

    params = {'site_no': sites, 'agency_cd': 'USGS', 'format': 'rdb_expanded'}

    #query NWIS
    r = requests.get('https://waterdata.usgs.gov/nwis/measurements/',
                     params=params)

    #Get position of first data entry
    match = re.search('\nUSGS\t', r.text)

    #save string to string buffer
    string_io = StringIO(r.text[match.span()[0]:])
    string_io.seek(0)
    string_io.next()

    #read in table and adjust timezone to UTC
    discrete_df = pd.read_table(string_io, header=None, engine='c', sep='\t')
    discrete_df[3] = pd.to_datetime(discrete_df[3])
    discrete_df[3] = [
        discrete_df[3].values[x] - uc.translate_tz(y) for x, y in zip(
            range(0, len(discrete_df[3].values)), discrete_df[4].values)
    ]

    discrete_df = discrete_df[[3, 4, 9]]
    discrete_df.columns = [["Time", "Tz", "Q"]]

    return discrete_df
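
get_discrete_data uses string_io.next() to drop one leading line before handing the buffer to pandas. A minimal offline sketch of the same skip-then-parse pattern over a made-up tab-separated payload (assumes pandas is installed; read_csv with sep='\t' stands in for read_table):

import io
import pandas as pd

payload = "discarded header line\n2020-01-01\tUTC\t1.5\n2020-01-02\tUTC\t2.0\n"
buf = io.StringIO(payload)
next(buf)                                   # Python 3 spelling of buf.next()
df = pd.read_csv(buf, sep="\t", header=None)
df.columns = ["Time", "Tz", "Q"]
print(df)
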
Example #6
def process_phylip_dist_mat(dist_data, trans_names):
    """Converts the output of make_phylip_seq_dist_mat into a format
    appropriate to use in downstream analysis.

    dist_data       A string of the dist_data as returned from the
                    phylip protdist and dnadist programs.
    trans_names     A dict mapping the translated names used in the
                    phylip alignment format with the original names.

    Returns:
    dmat -- A dict() keyed by all pairs of sequences with the genetic distance
            as the value. This can be used in the check_distance_pvals
            function.
    """

    rev_dict = dict((val, key) for key, val in trans_names.items())
    handle = StringIO(dist_data)
    handle.next()
    dist_groups = []

    for line in handle:
        if line.startswith('Seq-'):
            dist_groups.append('')
        dist_groups[-1] += line

    omat = {}
    tmpl = 'Seq-%i'
    for seq_num, group in enumerate(dist_groups):
        parts = group.split()[1:]
        for onum, val in enumerate(parts):
            nkey = (rev_dict[tmpl % seq_num], rev_dict[tmpl % onum])
            nval = float(val)
            if nval >= 0:
                omat[nkey] = nval

    return omat
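
A small worked example of the expected inputs, assuming process_phylip_dist_mat above is importable and running under Python 2 (it relies on StringIO.next). The distance matrix and name mapping are made up; trans_names maps original sequence names to the Seq-<i> labels used in the phylip files:

# First line is the sequence count, as written by protdist/dnadist.
dist_data = (
    "    2\n"
    "Seq-0       0.000000  0.123400\n"
    "Seq-1       0.123400  0.000000\n"
)
trans_names = {"alpha": "Seq-0", "beta": "Seq-1"}

dmat = process_phylip_dist_mat(dist_data, trans_names)
# dmat == {('alpha', 'alpha'): 0.0, ('alpha', 'beta'): 0.1234,
#          ('beta', 'alpha'): 0.1234, ('beta', 'beta'): 0.0}
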
def url_chunker(url, chunksize=1024):
    """Download the document at `url`, split it into chunks of
    `chunksize` lines each, and run a distributed word count over them.
    """
    user_agent = {'User-agent': 'Mozilla/5.0'}
    try:
        result = requests.get(url, headers=user_agent)
        doc = result.content
    except requests.RequestException:
        raise Exception("URL " + url + " not responding")

    text_in = StringIO(doc)
    chunks = []
    stop = False
    while not stop:
        text = ""
        for x in range(chunksize):
            try:
                text += text_in.next()
            except StopIteration:
                stop = True
                break
        # Append each chunk exactly once; the final (possibly short)
        # chunk is handled by the break above.
        if text:
            chunks.append(text)

    jobids = cloud.map(wordcount, [(url, c) for c in chunks])
    cloud.join(jobids, deadlock_check=False)
    results = cloud.result(jobids)

    index = reduce_results(results)

    mongo_insert(index)

    return "OK"
Example #9
def parse_problem(f, filtr=None):
	def parse(row):
		raw = row.split()
		foo = dict([(hdr, types[hdr](raw[idx])) for idx,hdr in enumerate(headers)])
		return foo
	def parse_setting(line):
		return [field.strip(':') for field in line.split()[1:]]

	print "Parsing", f
	settings = StringIO()
	settings.writelines(line for line in open(f, 'r') if line.startswith("#"))
	settings.seek(0)
	settings = dict([parse_setting(line) for line in settings])

	if filtr is not None and not filtr(settings):
		return None

	data = StringIO()
	data.writelines(line for line in open(f, 'r') if not line.startswith("#"))
	data.seek(0)
	headers = data.next().split()
	return settings, [parse(row) for row in data]
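
parse_problem reads the file twice, collecting '#' settings lines into one StringIO and data lines into another, then pops the header row with data.next(). A minimal, self-contained sketch of that two-pass pattern over an in-memory sample instead of a file:

import io

sample = "# solver cg\n# tol 1e-6\nx y\n1 2\n3 4\n"

settings_buf = io.StringIO()
settings_buf.writelines(line for line in io.StringIO(sample) if line.startswith("#"))
settings_buf.seek(0)
settings = dict(line.split()[1:3] for line in settings_buf)

data = io.StringIO()
data.writelines(line for line in io.StringIO(sample) if not line.startswith("#"))
data.seek(0)
headers = next(data).split()        # Python 3 spelling of data.next()
rows = [dict(zip(headers, row.split())) for row in data]

print(settings)   # {'solver': 'cg', 'tol': '1e-6'}
print(rows)       # [{'x': '1', 'y': '2'}, {'x': '3', 'y': '4'}]
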
Example #11
	def Make_Repeat_Mask_Txt( self, word_size=17, gapopen=5, e_thresh=0.0001, perc_identity=90, gapextend=2,
	                          min_length=75 ):
		"""
        Run blastn on contigs in input fasta file against database dbname. Parameters set to NCBI recommended defaults for blastn.
        """
		outfastapath = os.path.join(
			self.outdir, '{0}.fasta'.format(self.newrefid))
		prefix = os.path.join(self.outdir, self.newrefid)
		maskpath = prefix + '_repmask.array'
		regionspath = prefix + '_repregions.array'
		statspath = prefix + '.stats'
		
		blastn_cline = blastn(cmd=COMPASSCFG['tools']['blast']['path'] + "blastn", db=prefix, query=outfastapath,
		                      dust='no', word_size=word_size, gapopen=gapopen, gapextend=gapextend, evalue=e_thresh,
		                      perc_identity=perc_identity,
		                      outfmt='"6 qseqid sseqid pident length qstart qend sstart send"')
		try:
			blast_out, blast_err = blastn_cline()
			assert not blast_err
		except (AppError, AssertionError) as err:
			raise Exception(
				'Error: Blast failed during construction of repeat mask: {0}'.format(err))
		
		repmask_fp = open(maskpath, 'w')
		repregions_fp = open(regionspath, 'w')
		total_bp = 0
		repetitive_bp = 0
		num_regions = 0
		
		# each blast_rec is result from one query sequence (contig)
		blast_stream = StringIO(blast_out)
		prev_header = None
		for contig_count, contig in enumerate(SeqIO.parse(outfastapath, 'fasta'), 1):
			if prev_header != contig.name:
				repregions_fp.write('>{0}\n'.format(contig.name))
				prev_header = contig.name
			total_bp += len(contig)
			repmask = np.zeros(len(contig), dtype=np.bool)
			try:
				fields = blast_stream.next().split()
			except StopIteration:
				fields = None
			while fields and fields[0] == contig.name:
				contig_name, match_name = fields[:2]
				hit_perc_ident = float(fields[2])
				hit_length, q_start, q_end, s_start, s_end = (
					int(x) for x in fields[3:])
				(x1, y1), (x2, y2) = sorted(
					((q_start, q_end), sorted((s_start, s_end))))
				if hit_length >= min_length and (contig_name != match_name or not (x2 <= x1 <= y2 and x2 <= y1 <= y2)):
					repmask[q_start - 1:q_end] = True
				try:
					fields = blast_stream.next().split()
				except StopIteration:  # end of blast hits
					fields = None
			# output repmask as 1 and 0, 100 values per line
			repmask_fp.write('>{0}\n'.format(contig.name))
			for i in xrange(0, len(repmask), 100):
				j = min(i + 100, len(repmask))
				repmask_fp.write('{0}\n'.format(''.join(str(i)
				                                        for i in repmask[i:j].astype(int))))
			# identify positions of repetitive regions (runs of 1s in the
			# repmask array)
			# 0-based numbering
			region_starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)
			region_ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)
			# special case: full blast hit for this contig against another
			# contig
			if repmask.all():
				region_starts = [0]
				region_ends = [len(repmask)]
			# fix ends, in case regions start from the first position in the
			# sequence or end at the last
			if region_starts and ((not region_ends) or (region_starts[-1] > region_ends[-1])):
				region_ends.append(len(repmask))
			if region_ends and ((not region_starts) or (region_starts[0] > region_ends[0])):
				region_starts = [0] + region_starts
			repregions_fp.writelines('{0}\t{1}\n'.format(
				rs, re) for rs, re in izip(region_starts, region_ends))
			repetitive_bp += repmask.sum()
			num_regions += len(region_starts)
		
		repmask_fp.close()
		repregions_fp.close()
		pct_repetitive = '{0:.2f}'.format(
			(float(repetitive_bp) / total_bp) * 100)
		LE.debug(
			'Info: Repetitive regions for all of {0}: {1}/{2} bp ({3}%)'.format(self.newrefid, repetitive_bp, total_bp,
			                                                                    pct_repetitive))
		
		# save result summary
		statsvalues = '\t'.join((self.newrefid, self.newrefid, str(contig_count), str(total_bp), str(repetitive_bp),
		                         str(num_regions), pct_repetitive))
		with open(statspath, 'w') as o:
			o.write('refid\trefcd\tcontigs\tnumbp\trepetitivebp\trepregions\trepetitivepct\n{values}\n'.format(
				values=statsvalues))
		return
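
Make_Repeat_Mask_Txt finds runs of True in the repeat mask by comparing the array against a shifted copy of itself. A minimal, self-contained sketch of that run-boundary trick, with the boundary padding handled slightly differently (assumes numpy is installed):

import numpy as np

# True marks repetitive positions (made-up mask).
repmask = np.array([False, True, True, False, False, True, True, True])

# A run starts where the mask flips 0 -> 1 and ends where it flips 1 -> 0.
region_starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)
region_ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)

# Pad for runs that touch either end of the array.
if repmask[0]:
    region_starts = [0] + region_starts
if repmask[-1]:
    region_ends.append(len(repmask))

for start, end in zip(region_starts, region_ends):
    print(start, end)        # prints 1 3, then 5 8 (half-open intervals)
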
Example #12
    def __init__(self, content):
        """
        Parse WOUDC Extended CSV into internal data structure

        :param content: string buffer of content
        """

        header_fields = [
            "PROFILE",
            "DAILY",
            "GLOBAL",
            "DIFFUSE",
            # 'MONTHLY',
            "OZONE_PROFILE",
            "N14_VALUES",
            "C_PROFILE",
            "OBSERVATIONS",
            "PUMP_CORRECTION",
            "SIMULTANEOUS",
            "DAILY_SUMMARY",
            "DAILY_SUMMARY_NSF",
            "SAOZ_DATA_V2",
            "GLOBAL_DAILY_TOTALS",
        ]

        self.sections = {}
        self.metadata_tables = []
        self.data_tables = []
        self.all_tables = []
        self.comments = {}
        self.updated = False
        self.errors = []

        LOGGER.info("processing Extended CSV")
        blocks = re.split("#", content)
        if len(blocks) == 0:
            msg = "no tables found"
            LOGGER.error(msg)
        # get rid of first element of cruft
        head_comment = blocks.pop(0)
        c = StringIO(head_comment.strip())
        for line in c:
            if all([line.strip() != "", line.strip() != os.linesep, line[0] != "*"]):
                self.errors.append(_violation_lookup(9))
        self.table_count = {}
        for b in blocks:
            # normalize delimiters to commas (str.replace returns a new
            # string, so its result must be reassigned)
            if "::" in b:
                b = b.replace("::", ",")
            if ";" in b:
                b = b.replace(";", ",")
            if "$" in b:
                b = b.replace("$", ",")
            if "%" in b:
                b = b.replace("%", ",")
            try:
                s = StringIO(b.strip())
                c = csv.reader(s)
                header = (c.next()[0]).strip()
            except Exception as err:
                self.errors.append(_violation_lookup(0))
            if header not in header_fields:  # metadata
                if header not in self.sections:
                    self.sections[header] = {}
                    self.metadata_tables.append(header)
                    self.table_count[header] = 1
                    self.all_tables.append(header)
                else:
                    self.table_count[header] = self.table_count[header] + 1
                    header = "%s%s" % (header, self.table_count[header])
                    self.sections[header] = {}
                    self.metadata_tables.append(header)
                self.sections[header]["_raw"] = b.strip()
                try:
                    fields = c.next()
                    if len(fields[0]) > 0:
                        if fields[0][0] == "*":
                            self.errors.append(_violation_lookup(8))
                except StopIteration:
                    msg = "Extended CSV table %s has no fields" % header
                    LOGGER.info(msg)
                    self.errors.append(_violation_lookup(140, header))
                values = None
                try:
                    values = c.next()
                    if len(values[0]) > 0:
                        if values[0][0] == "*":
                            self.errors.append(_violation_lookup(8))
                except StopIteration:
                    msg = "Extended CSV table %s has no values" % header
                    LOGGER.info(msg)
                    self.errors.append(_violation_lookup(140, header))
                    continue
                try:
                    anything_more = (c.next()[0]).strip()
                    if all(
                        [
                            anything_more is not None,
                            anything_more != "",
                            anything_more != os.linesep,
                            "*" not in anything_more,
                        ]
                    ):
                        self.errors.append(_violation_lookup(140, header))
                except Exception as err:
                    LOGGER.warning(err)
                if len(values) > len(fields):
                    self.errors.append(_violation_lookup(7, header))
                    continue
                i = 0
                for field in fields:
                    field = field.strip()
                    try:
                        self.sections[header][field] = (values[i]).strip()
                        i += 1
                    except (KeyError, IndexError):
                        self.sections[header][field] = None
                        msg = "corrupt format section %s skipping" % header
                        LOGGER.debug(msg)
            else:  # payload
                buf = StringIO()
                w = csv.writer(buf)
                columns = None
                for row in c:
                    if columns is None:
                        columns = row
                    if all([row != "", row is not None, row != []]):
                        if "*" not in row[0]:
                            w.writerow(row)
                        else:
                            if columns[0].lower() == "time":
                                self.errors.append(_violation_lookup(21))
                if header not in self.sections:
                    self.all_tables.append(header)
                    self.data_tables.append(header)
                    self.table_count[header] = 1
                else:
                    self.table_count[header] = self.table_count[header] + 1
                    header = "%s%s" % (header, self.table_count[header])
                    self.sections[header] = {}
                    self.data_tables.append(header)
                self.sections[header] = {"_raw": buf.getvalue()}
        # objectify comments found in file
        # preserve order of occurence
        hash_detected = False
        table = None
        comments_list = []
        table_count = {}
        for line in content.splitlines():
            if "#" in line:  # table detected
                if not hash_detected:
                    self.comments["header_comments"] = comments_list
                    comments_list = []
                    table = line[1:].strip()
                    if table in table_count.keys():
                        table_count[table] = table_count[table] + 1
                        table = "%s_%s" % (table, table_count[table])
                    else:
                        table_count[table] = 1
                    hash_detected = True
                    continue
                self.comments[table] = comments_list
                table = line[1:].strip()
                if table in table_count.keys():
                    table_count[table] = table_count[table] + 1
                    table = "%s_%s" % (table, table_count[table])
                else:
                    table_count[table] = 1
                comments_list = []
                continue
            # comments are prefixed by '*' in column 0 of each line
            if line.startswith("*"):  # comment detected,
                comments_list.append(line.strip("\n"))
        self.comments[table] = comments_list

        # check for required table presence
        if "CONTENT" not in self.metadata_tables:
            self.errors.append(_violation_lookup(1, "CONTENT"))
        if "DATA_GENERATION" not in self.metadata_tables:
            self.errors.append(_violation_lookup(1, "DATA_GENERATION"))
        if "INSTRUMENT" not in self.metadata_tables:
            self.errors.append(_violation_lookup(1, "INSTRUMENT"))
        if "PLATFORM" not in self.metadata_tables:
            self.errors.append(_violation_lookup(1, "PLATFORM"))
        if "LOCATION" not in self.metadata_tables:
            self.errors.append(_violation_lookup(1, "LOCATION"))
        if "TIMESTAMP" not in self.metadata_tables:
            self.errors.append(_violation_lookup(1, "TIMESTAMP"))

        if len(self.errors) != 0:
            self.errors = list(set(self.errors))
            msg = "Unable to parse extended CSV file"
            raise WOUDCExtCSVReaderError(msg, self.errors)
	def Order_Content(self):
		
		if(self.content_ordered == False):
			prefix = [ '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.', '10.', '11.', '12.' ]
			line = StringIO(self.toString())
			active = []
			ignore = []
			skip = False
			
			line.readline()
			line.readline()

			self.info['Class'] = line.readline()

			temp = line.readline()
			if temp != "\n":
				self.info['Date_Time'] = temp
			else:
				self.info['Date_Time'] = line.readline()

			line.readline()

			self.info['Ring_Table'] = line.readline()

			line.close()

			line = StringIO(self.toString())

			for next in line:
				
				if next == '\n':
					continue
				elif next == 'HORSE\n':
					active = self.info['Horse']
					skip = True
				elif 'RIDER (NATIONALITY)' in next:
					active = self.info[ 'Rider' ]
					skip = True
				elif (next == 'Horse \n') or (next == 'Order \n') or (next == 'Horse\n') or (next == 'Order\n') or (next == '#\n') or (next == 'of\n'):
					active = ignore
					skip = False
				else:
					active.append( next )

				if skip:
					try:
						line.next()
					except StopIteration:
						break
					try:
						line.next()
					except StopIteration:
						break

			self.content_ordered = True

			Len = len(self.info.get('Rider'))
			for i in range(1, 13):
				try:
					if any( num in self.info['Rider'][Len - i] for num in prefix ):
						# print self.info['Rider'][Len-1]
						self.info['Rider'].pop( Len - i )
				except IndexError:
					pass

			Len = len(self.info.get('Horse'))
			for i in range(1, 13):
				try:
					if any( num in self.info['Horse'][Len - i] for num in prefix ):
						self.info['Horse'].pop( Len - i )
				except IndexError:
					pass
			

		return self.info
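
The Extended CSV parser in Example #12 reads each '#'-delimited block with csv.reader over a StringIO, pulling the table name, field names, and values with successive next() calls. A minimal, self-contained sketch of that header/fields/values pattern (the block content is made up):

import csv
import io

block = "PLATFORM\nType,ID,Name\nSTN,007,Example Station\n"

reader = csv.reader(io.StringIO(block))
header = next(reader)[0].strip()        # table name: 'PLATFORM'
fields = next(reader)                   # ['Type', 'ID', 'Name']
values = next(reader)                   # ['STN', '007', 'Example Station']
section = dict(zip((f.strip() for f in fields), (v.strip() for v in values)))

print(header)    # PLATFORM
print(section)   # {'Type': 'STN', 'ID': '007', 'Name': 'Example Station'}
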
Example #14
 def next(self, chunk_size=None):
     return StringIO.next(self)
Example #15
def mutation_search(request):

    valid_fields = ['Start-Pos', 'End-Pos', 'Gene-Name', 'Mutation',
                    'Mutation-Descriptions', 'Effect-Type', 'HIVGene',
                    'Interaction-Type', 'HumanGene', 'Articles']
    show_fields = ['Start-Pos', 'End-Pos', 'Gene-Name', 'Mutation',
                    'Mutation-Descriptions', 'Articles']

    if request.method == 'POST':

        form = MutationSearch(request.POST)
        if form.is_valid():

            handle = StringIO(form.cleaned_data['lines'])
            headers = handle.next().strip().split(',')
            print headers[0]
            req = set(['Entrez', 'Start', 'Stop'])
            extra_headers = list(set(headers) - req)
            good_lines = []
            for row in DictReader(handle, fieldnames = headers):


                valid_muts = Mutation.objects.filter(Position__gte = int(row['Start']))
                valid_muts = valid_muts.filter(Position__lte = int(row['Stop']))

                if form.cleaned_data['allow']:
                    valid_muts = valid_muts.filter(Q(Gene__Entrez = int(row['Entrez'])) | Q(Gene__isnull = True))
                else:
                    valid_muts = valid_muts.filter(Gene__Entrez = int(row['Entrez']))
                if valid_muts.exists():
                    try:
                        gene = Gene.objects.get(Entrez = int(row['Entrez']))
                    except MultipleObjectsReturned:
                        gene = Gene.objects.filter(Entrez = int(row['Entrez']))[0]



                    good_lines.append({'Gene':gene, 'labels':dict(map(lambda x: (x,row[x]), extra_headers)),
                                       'Position':(row['Start'], row['Stop']),
                                        'Mutations':valid_muts})

            order = map(lambda x: x.strip(), form.cleaned_data['order'].split(','))
            dict_list, field_names = MakeDictList(good_lines, order = order, show_fields = show_fields)

            context = {
                'form':form,
                'good_lines':dict_list,
                'extra_headers':extra_headers,
                'field_names':field_names,
                'sortable_fields':field_names
            }
            if form.cleaned_data['csv_format']:
                resp = HttpResponse()
                resp['Content-Disposition'] = 'attachment; filename=mutation_results.csv'

                writer = DictWriter(resp, field_names, for_csv = True)
                for line in dict_list:
                    writer.writerow(line)

                return resp
            else:
                return render_to_response('Interaction/mutation_search.html', context,
                                          context_instance = RequestContext(request))
    else:
        form = MutationSearch(initial = {'order':'Gene-Name, Mutation'})


    context = {
        'form':form,
        'sortable_fields':valid_fields
    }    

    return render_to_response('Interaction/mutation_search.html', context,
                                context_instance = RequestContext(request))
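
mutation_search pops the header line from the pasted text with handle.next() and feeds the remaining lines to DictReader with explicit fieldnames. A minimal, self-contained sketch of that pattern (the pasted lines and any column beyond the required Entrez/Start/Stop are made up):

import csv
import io

lines = "Entrez,Start,Stop,Label\n123,10,20,siteA\n456,30,40,siteB\n"

handle = io.StringIO(lines)
headers = next(handle).strip().split(",")       # ['Entrez', 'Start', 'Stop', 'Label']
extra_headers = [h for h in headers if h not in ("Entrez", "Start", "Stop")]

# Pass fieldnames explicitly because the header line is already consumed.
for row in csv.DictReader(handle, fieldnames=headers):
    span = (int(row["Start"]), int(row["Stop"]))
    labels = {h: row[h] for h in extra_headers}
    print(row["Entrez"], span, labels)
# 123 (10, 20) {'Label': 'siteA'}
# 456 (30, 40) {'Label': 'siteB'}
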