class TestVariablePairs(unittest.TestCase):
    """Verify that MINEResults correctly reports missing or extraneous variables."""

    def setUp(self):
        self.fp = StringIO(SAMPLE_MIC)
        self.err = StringIO()

    def spin(self, var_list):
        """Create a MINEResults object, read all lines, return err buffer.

        Returns:
            str: resulting err buffer contents
        """
        m = top.MINEResults(self.fp, var_list=var_list, err=self.err)
        for row in m.get_rows():
            pass
        return self.err.getvalue()

    def test_all_varpairs_match(self):
        """Verify no duplicate pairs of variables."""
        # All variables present.
        msg = self.spin(var_list=VARS[:])
        self.assertEqual(msg, "")

    def test_extra_variable_in_list(self):
        msg = self.spin(VARS[:] + ['abc'])
        self.assertEqual(msg, EXTRA_EXPECTED)

    def test_extra_mine_variable_not_in_list(self):
        msg = self.spin(VARS[:-1])
        self.assertEqual(len(VARS[:-1]), 3)
        self.assertEqual(msg, EXTRA_VAR_WARNINGS)

    def test_extra_mine_variable_not_in_list_extra(self):
        m = top.MINEResults(self.fp, var_list=VARS[:-1], err=self.err)
        for row in m.get_rows():
            pass
        msg = self.err.getvalue()
        # Verify all vars are in var_pairs.
        for key in VARS[:-1]:
            self.assertTrue(key in m.var_pairs.vars)
        self.assertEqual(m.var_pairs.n_set, sum(m.var_pairs.V))
        self.assertEqual(m.var_pairs.n, 3)
        # The diagonal is not stored, so the number of unique pairs is
        # n choose 2, not the sum of i up to n.
        self.assertEqual(m.var_pairs.n_max_pairs, 3)
        self.assertEqual(len(VARS[:-1]), 3)
        self.assertEqual(msg, EXTRA_VAR_WARNINGS)

    def test_all_var_but_missing_pair(self):
        # Skip the first two lines.
        self.fp.next()
        self.fp.next()
        msg = self.spin(VARS[:])
        self.assertEqual(msg, MISSING_PAIR_WARNING)
class FlickrFile(object):
    """A file-like object representing a file on flickr.

    Caches contents locally in a StringIO object."""

    def __init__(self, imageid, name, data):
        self.imageid = imageid
        self.name = name
        self.stringio = StringIO(data)
        self.closed = False
        self.newlines = ('\r', '\n', '\r\n')
        self.flush()

    def close(self):
        self.flush()
        self.closed = True

    def _stringio_get_data(self):
        # Read the full buffer without disturbing the current position.
        old_seek = self.stringio.tell()
        self.stringio.seek(0)
        data = self.stringio.read()
        self.stringio.seek(old_seek)
        return data

    def flush(self):
        data = self._stringio_get_data()
        with tempfile.NamedTemporaryFile() as tf:
            data_to_png(data).save(tf, 'png')
            if self.imageid:
                flickr.replace(filename=tf.name, photo_id=self.imageid,
                               title=self.name, description=str(len(data)),
                               format='bs4')
            else:
                self.imageid = flickr.upload(filename=tf.name,
                                             title=self.name,
                                             description=str(len(data)),
                                             format='bs4').photoid.text

    def __iter__(self):
        return self

    def next(self):
        return self.stringio.next()

    def read(self, size=-1):
        return self.stringio.read(size)

    def readline(self, size=-1):
        return self.stringio.readline(size)

    def seek(self, offset, whence=0):
        return self.stringio.seek(offset, whence)

    def tell(self):
        return self.stringio.tell()

    def truncate(self, size=0):
        return self.stringio.truncate(size)

    def write(self, data):
        return self.stringio.write(data)

    def writelines(self, seq):
        return self.stringio.writelines(seq)
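# A minimal usage sketch for FlickrFile (not part of the class above).
# It assumes `flickr` is an authenticated flickrapi client and
# `data_to_png` encodes a string into a PNG image, both defined elsewhere
# in this module; passing imageid=None triggers a fresh upload.
f = FlickrFile(imageid=None, name='notes.txt', data='hello world\n')
print f.readline()   # served from the local StringIO cache
f.seek(0, 2)         # seek to the end before appending
f.write('another line\n')
f.close()            # close() flushes the updated buffer back to flickr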
def testSample(self):
    mockInput = MagicMock(return_value=StringIO(self.sampleInput))
    output = StringIO()
    mockOutput = MagicMock(return_value=output)
    with patch("__builtin__.open", mockInput):
        inputFile = FileRecordStream("input_path")
        with patch("__builtin__.open", mockOutput):
            outputFile = FileRecordStream("output_path",
                                          fields=inputFile.getFields(),
                                          write=True)
            anomalyzer.sample(inputFile, outputFile, 1)
    result = StringIO(output.getvalue())
    result.next()
    result.next()
    result.next()
    reader = csv.reader(result)
    _, value = reader.next()
    self.assertIn(int(value), (1, 2, 3, 4, 5, 6))
    self.assertRaises(StopIteration, result.next)
def get_discrete_data(sites):
    '''
    Gets the time series of discrete discharge field measurements

    Keyword Arguments:
    sites: site id to be queried

    Returns:
    Pandas DataFrame of Time, Timezone, and Discharge
    '''
    params = {'site_no': sites,
              'agency_cd': 'USGS',
              'format': 'rdb_expanded'}
    # Query NWIS.
    r = requests.get('https://waterdata.usgs.gov/nwis/measurements/',
                     params=params)
    # Find the first data entry and save the rest of the response to a
    # string buffer, skipping the matched header line.
    match = re.search('\nUSGS\t', r.text)
    string_io = StringIO(r.text[match.span()[0]:])
    string_io.seek(0)
    string_io.next()
    # Read in the table and adjust the timezone to UTC.
    discrete_df = pd.read_table(string_io, header=None, engine='c', sep='\t')
    discrete_df[3] = pd.to_datetime(discrete_df[3])
    discrete_df[3] = [discrete_df[3].values[x] - uc.translate_tz(y)
                      for x, y in zip(range(0, len(discrete_df[3].values)),
                                      discrete_df[4].values)]
    discrete_df = discrete_df[[3, 4, 9]]
    discrete_df.columns = ["Time", "Tz", "Q"]
    return discrete_df
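# A short usage sketch (assumes the `uc` helper module providing
# translate_tz is importable); the site number below is a hypothetical
# example value.
df = get_discrete_data('09380000')
print df.head()   # columns: Time (shifted to UTC), Tz, Q (discharge)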
def process_phylip_dist_mat(dist_data, trans_names):
    """Converts the output of make_phylip_seq_dist_mat into a format
    appropriate to use in downstream analysis.

    dist_data       A string of the dist_data as returned from the phylip
                    protdist and dnadist programs.
    trans_names     A dict mapping the translated names used in the phylip
                    alignment format to the original names.

    Returns:
    dmat -- A dict() keyed by all pairs of sequences with the genetic
            distance as the value. This can be used in the
            check_distance_pvals function.
    """
    rev_dict = dict((val, key) for key, val in trans_names.items())
    handle = StringIO(dist_data)
    handle.next()
    dist_groups = []
    for line in handle:
        if line.startswith('Seq-'):
            dist_groups.append('')
        dist_groups[-1] += line

    omat = {}
    tmpl = 'Seq-%i'
    for seq_num, group in enumerate(dist_groups):
        parts = group.split()[1:]
        for onum, val in enumerate(parts):
            nkey = (rev_dict[tmpl % seq_num], rev_dict[tmpl % onum])
            nval = float(val)
            if nval >= 0:
                omat[nkey] = nval
    return omat
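# A minimal round-trip sketch, assuming make_phylip_seq_dist_mat (defined
# elsewhere in this module) returns (trans_names, dist_data) and that
# `seqs` is a list of (name, sequence) pairs.
trans_names, dist_data = make_phylip_seq_dist_mat(seqs, generic_protein)
dmat = process_phylip_dist_mat(dist_data, trans_names)
for (name_a, name_b), dist in sorted(dmat.items())[:5]:
    print name_a, name_b, dist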
def url_chunker(url, chunksize=1024):
    """Fetches a URL, splits it into chunks of `chunksize` lines, and
    word-counts the chunks on the cloud, inserting the index into mongo.

    *Params*
    #url - the URL to fetch
    #chunksize - how many lines to read at once?
    """
    user_agent = {'User-agent': 'Mozilla/5.0'}
    result = requests.get(url, headers=user_agent)
    try:
        doc = result.content
    except:
        raise Exception("URL " + url + " not responding")
    text_in = StringIO(doc)
    chunks = []
    stop = False
    while not stop:
        text = ""
        for x in range(chunksize):
            try:
                text += text_in.next()
            except StopIteration:
                stop = True
                break
        chunks.append(text)
    jobids = cloud.map(wordcount, [(url, c) for c in chunks])
    cloud.join(jobids, deadlock_check=False)
    results = cloud.result(jobids)
    index = reduce_results(results)
    mongo_insert(index)
    return "OK"
def parse_problem(f, filtr=None):
    def parse(row):
        raw = row.split()
        return dict([(hdr, types[hdr](raw[idx]))
                     for idx, hdr in enumerate(headers)])

    def parse_setting(line):
        return [field.strip(':') for field in line.split()[1:]]

    print "Parsing", f
    settings = StringIO()
    settings.writelines(line for line in open(f, 'r')
                        if line.startswith("#"))
    settings.seek(0)
    settings = dict([parse_setting(line) for line in settings])
    if filtr is not None and not filtr(settings):
        return None
    data = StringIO()
    data.writelines(line for line in open(f, 'r')
                    if not line.startswith("#"))
    data.seek(0)
    headers = data.next().split()
    return settings, [parse(row) for row in data]
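# A short usage sketch; 'run1.dat' is a hypothetical problem file whose
# settings sit on '#'-prefixed lines, and `types` (used by parse above)
# is assumed to be a module-level dict of header name -> converter.
parsed = parse_problem('run1.dat', filtr=lambda s: s.get('solver') == 'cg')
if parsed is not None:
    settings, rows = parsed
    print len(rows), "rows with settings", settings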
    print orig_name, new_name
    node = out_tree.find_node_with_taxon_label(new_name)
    if node:
        node.taxon = orig_name
    return out_tree

# <codecell>

names, dmat = make_phylip_seq_dist_mat(test_seqs[0:200].to_dict().items(),
                                       generic_protein)

# <codecell>

names.items()[:5]

rev_dict = dict((val, key) for key, val in names.items())
handle = StringIO(dmat)
nseqs = int(handle.next())
dist_groups = []
for line in handle:
    if line.startswith('Seq-'):
        dist_groups.append('')
    dist_groups[-1] += line

omat = {}
tmpl = 'Seq-%i'
for seq_num, group in enumerate(dist_groups[:5]):
    parts = group.split()[1:]
    for onum, val in enumerate(parts):
        nkey = (rev_dict[tmpl % seq_num], rev_dict[tmpl % onum])
        nval = float(val)
        if nval >= 0:
            omat[nkey] = nval
def Make_Repeat_Mask_Txt(self, word_size=17, gapopen=5, e_thresh=0.0001,
                         perc_identity=90, gapextend=2, min_length=75):
    """
    Run blastn on contigs in input fasta file against database dbname.
    Parameters set to NCBI recommended defaults for blastn.
    """
    outfastapath = os.path.join(self.outdir,
                                '{0}.fasta'.format(self.newrefid))
    prefix = os.path.join(self.outdir, self.newrefid)
    maskpath = prefix + '_repmask.array'
    regionspath = prefix + '_repregions.array'
    statspath = prefix + '.stats'
    blastn_cline = blastn(cmd=COMPASSCFG['tools']['blast']['path'] + "blastn",
                          db=prefix, query=outfastapath, dust='no',
                          word_size=word_size, gapopen=gapopen,
                          gapextend=gapextend, evalue=e_thresh,
                          perc_identity=perc_identity,
                          outfmt='"6 qseqid sseqid pident length qstart qend sstart send"')
    try:
        blast_out, blast_err = blastn_cline()
        assert not blast_err
    except (AppError, AssertionError) as err:
        raise Exception(
            'Error: Blast failed during construction of repeat mask: {0}'.format(err))
    repmask_fp = open(maskpath, 'w')
    repregions_fp = open(regionspath, 'w')
    total_bp = 0
    repetitive_bp = 0
    num_regions = 0
    # Each blast_rec is the result from one query sequence (contig).
    blast_stream = StringIO(blast_out)
    prev_header = None
    for contig_count, contig in enumerate(SeqIO.parse(outfastapath, 'fasta'), 1):
        if prev_header != contig.name:
            repregions_fp.write('>{0}\n'.format(contig.name))
            prev_header = contig.name
        total_bp += len(contig)
        repmask = np.zeros(len(contig), dtype=np.bool)
        try:
            fields = blast_stream.next().split()
        except StopIteration:
            fields = None
        while fields and fields[0] == contig.name:
            contig_name, match_name = fields[:2]
            hit_perc_ident = float(fields[2])
            hit_length, q_start, q_end, s_start, s_end = (
                int(x) for x in fields[3:])
            (x1, y1), (x2, y2) = sorted(
                ((q_start, q_end), sorted((s_start, s_end))))
            if hit_length >= min_length and (contig_name != match_name or
                                             not (x2 <= x1 <= y2 and
                                                  x2 <= y1 <= y2)):
                repmask[q_start - 1:q_end] = True
            try:
                fields = blast_stream.next().split()
            except StopIteration:
                # End of blast hits.
                fields = None
        # Output repmask as 1s and 0s, 100 per line.
        repmask_fp.write('>{0}\n'.format(contig.name))
        for i in xrange(0, len(repmask), 100):
            j = min(i + 100, len(repmask))
            repmask_fp.write('{0}\n'.format(
                ''.join(str(b) for b in repmask[i:j].astype(int))))
        # Identify positions of repetitive regions (runs of 1s in the
        # repmask array), 0-based numbering.
        region_starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)
        region_ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)
        # Special case: full blast hit for this contig against another
        # contig.
        if repmask.all():
            region_starts = [0]
            region_ends = [len(repmask)]
        # Fix ends, in case regions start from the first position in the
        # sequence or end at the last.
        if region_starts and ((not region_ends) or
                              (region_starts[-1] > region_ends[-1])):
            region_ends.append(len(repmask))
        if region_ends and ((not region_starts) or
                            (region_starts[0] > region_ends[0])):
            region_starts = [0] + region_starts
        repregions_fp.writelines(
            '{0}\t{1}\n'.format(rs, re)
            for rs, re in izip(region_starts, region_ends))
        repetitive_bp += repmask.sum()
        num_regions += len(region_starts)
    repmask_fp.close()
    repregions_fp.close()
    pct_repetitive = '{0:.2f}'.format((float(repetitive_bp) / total_bp) * 100)
    LE.debug('Info: Repetitive regions for all of {0}: {1}/{2} bp ({3}%)'.format(
        self.newrefid, repetitive_bp, total_bp, pct_repetitive))
    # Save result summary.
    statsvalues = '\t'.join((self.newrefid, self.newrefid, str(contig_count),
                             str(total_bp), str(repetitive_bp),
                             str(num_regions), pct_repetitive))
    with open(statspath, 'w') as o:
        o.write('refid\trefcd\tcontigs\tnumbp\trepetitivebp\trepregions\t'
                'repetitivepct\n{values}\n'.format(values=statsvalues))
    return
def __init__(self, content):
    """
    Parse WOUDC Extended CSV into internal data structure

    :param content: string buffer of content
    """
    header_fields = [
        "PROFILE", "DAILY", "GLOBAL", "DIFFUSE",
        # "MONTHLY",
        "OZONE_PROFILE", "N14_VALUES", "C_PROFILE", "OBSERVATIONS",
        "PUMP_CORRECTION", "SIMULTANEOUS", "DAILY_SUMMARY",
        "DAILY_SUMMARY_NSF", "SAOZ_DATA_V2", "GLOBAL_DAILY_TOTALS",
    ]
    self.sections = {}
    self.metadata_tables = []
    self.data_tables = []
    self.all_tables = []
    self.comments = {}
    self.updated = False
    self.errors = []

    LOGGER.info("processing Extended CSV")
    blocks = re.split("#", content)
    if len(blocks) == 0:
        msg = "no tables found"
        LOGGER.error(msg)
    # Get rid of the first element of cruft.
    head_comment = blocks.pop(0)
    c = StringIO(head_comment.strip())
    for line in c:
        if all([line.strip() != "",
                line.strip() != os.linesep,
                line[0] != "*"]):
            self.errors.append(_violation_lookup(9))
    self.table_count = {}
    for b in blocks:
        # Determine delimiter and normalize it to a comma
        # (str.replace returns a new string; the result must be kept).
        if "::" in b:
            b = b.replace("::", ",")
        if ";" in b:
            b = b.replace(";", ",")
        if "$" in b:
            b = b.replace("$", ",")
        if "%" in b:
            b = b.replace("%", ",")
        try:
            s = StringIO(b.strip())
            c = csv.reader(s)
            header = (c.next()[0]).strip()
        except Exception as err:
            self.errors.append(_violation_lookup(0))
            continue  # without a parsable header the block cannot be processed
        if header not in header_fields:  # metadata
            if header not in self.sections:
                self.sections[header] = {}
                self.metadata_tables.append(header)
                self.table_count[header] = 1
                self.all_tables.append(header)
            else:
                self.table_count[header] = self.table_count[header] + 1
                header = "%s%s" % (header, self.table_count[header])
                self.sections[header] = {}
                self.metadata_tables.append(header)
            self.sections[header]["_raw"] = b.strip()
            try:
                fields = c.next()
                if len(fields[0]) > 0:
                    if fields[0][0] == "*":
                        self.errors.append(_violation_lookup(8))
            except StopIteration:
                msg = "Extended CSV table %s has no fields" % header
                LOGGER.info(msg)
                self.errors.append(_violation_lookup(140, header))
            values = None
            try:
                values = c.next()
                if len(values[0]) > 0:
                    if values[0][0] == "*":
                        self.errors.append(_violation_lookup(8))
            except StopIteration:
                msg = "Extended CSV table %s has no values" % header
                LOGGER.info(msg)
                self.errors.append(_violation_lookup(140, header))
                continue
            try:
                anything_more = (c.next()[0]).strip()
                if all([anything_more is not None,
                        anything_more != "",
                        anything_more != os.linesep,
                        "*" not in anything_more]):
                    self.errors.append(_violation_lookup(140, header))
            except Exception as err:
                LOGGER.warning(err)
            if len(values) > len(fields):
                self.errors.append(_violation_lookup(7, header))
                continue
            i = 0
            for field in fields:
                field = field.strip()
                try:
                    self.sections[header][field] = (values[i]).strip()
                    i += 1
                except (KeyError, IndexError):
                    self.sections[header][field] = None
                    msg = "corrupt format section %s skipping" % header
                    LOGGER.debug(msg)
        else:  # payload
            buf = StringIO()
            w = csv.writer(buf)
            columns = None
            for row in c:
                if columns is None:
                    columns = row
                if all([row != "", row is not None, row != []]):
                    if "*" not in row[0]:
                        w.writerow(row)
                    else:
                        if columns[0].lower() == "time":
                            self.errors.append(_violation_lookup(21))
            if header not in self.sections:
                self.all_tables.append(header)
                self.data_tables.append(header)
                self.table_count[header] = 1
            else:
                self.table_count[header] = self.table_count[header] + 1
                header = "%s%s" % (header, self.table_count[header])
                self.sections[header] = {}
                self.data_tables.append(header)
            self.sections[header] = {"_raw": buf.getvalue()}

    # Objectify comments found in the file, preserving order of occurrence.
    hash_detected = False
    table = None
    comments_list = []
    table_count = {}
    for line in content.splitlines():
        if "#" in line:  # table detected
            if not hash_detected:
                self.comments["header_comments"] = comments_list
                comments_list = []
                table = line[1:].strip()
                if table in table_count.keys():
                    table_count[table] = table_count[table] + 1
                    table = "%s_%s" % (table, table_count[table])
                else:
                    table_count[table] = 1
                hash_detected = True
                continue
            self.comments[table] = comments_list
            table = line[1:].strip()
            if table in table_count.keys():
                table_count[table] = table_count[table] + 1
                table = "%s_%s" % (table, table_count[table])
            else:
                table_count[table] = 1
            comments_list = []
            continue
        # Comments are prefixed by '*' in column 0 of each line.
        if line.startswith("*"):  # comment detected
            comments_list.append(line.strip("\n"))
    self.comments[table] = comments_list

    # Check for required table presence.
    if "CONTENT" not in self.metadata_tables:
        self.errors.append(_violation_lookup(1, "CONTENT"))
    if "DATA_GENERATION" not in self.metadata_tables:
        self.errors.append(_violation_lookup(1, "DATA_GENERATION"))
    if "INSTRUMENT" not in self.metadata_tables:
        self.errors.append(_violation_lookup(1, "INSTRUMENT"))
    if "PLATFORM" not in self.metadata_tables:
        self.errors.append(_violation_lookup(1, "PLATFORM"))
    if "LOCATION" not in self.metadata_tables:
        self.errors.append(_violation_lookup(1, "LOCATION"))
    if "TIMESTAMP" not in self.metadata_tables:
        self.errors.append(_violation_lookup(1, "TIMESTAMP"))

    if len(self.errors) != 0:
        self.errors = list(set(self.errors))
        msg = "Unable to parse extended CSV file"
        raise WOUDCExtCSVReaderError(msg, self.errors)
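# A minimal usage sketch, assuming this __init__ belongs to a Reader-style
# class (hypothetically named Reader here) and that 'ozone_record.csv' is
# a hypothetical WOUDC Extended CSV file.
with open('ozone_record.csv') as fh:
    ecsv = Reader(fh.read())
print ecsv.metadata_tables
print ecsv.sections["CONTENT"]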
def Order_Content(self):
    if not self.content_ordered:
        prefix = ['1.', '2.', '3.', '4.', '5.', '6.',
                  '7.', '8.', '9.', '10.', '11.', '12.']
        buf = StringIO(self.toString())
        active = []
        ignore = []
        skip = False
        # The leading lines hold the class, date/time and ring table info.
        buf.readline()
        buf.readline()
        self.info['Class'] = buf.readline()
        temp = buf.readline()
        if temp != "\n":
            self.info['Date_Time'] = temp
        else:
            self.info['Date_Time'] = buf.readline()
        buf.readline()
        self.info['Ring_Table'] = buf.readline()
        buf.close()

        buf = StringIO(self.toString())
        for line in buf:
            if line == '\n':
                continue
            elif line == 'HORSE\n':
                active = self.info['Horse']
                skip = True
            elif 'RIDER (NATIONALITY)' in line:
                active = self.info['Rider']
                skip = True
            elif line in ('Horse \n', 'Order \n', 'Horse\n',
                          'Order\n', '#\n', 'of\n'):
                active = ignore
                skip = False
            else:
                active.append(line)
                if skip:
                    try:
                        buf.next()
                    except StopIteration:
                        break
                    try:
                        buf.next()
                    except StopIteration:
                        break
        self.content_ordered = True

        # Strip trailing order-number rows ('1.', '2.', ...) from the
        # rider and horse lists.
        Len = len(self.info.get('Rider'))
        for i in range(1, 13):
            try:
                if any(num in self.info['Rider'][Len - i] for num in prefix):
                    self.info['Rider'].pop(Len - i)
            except IndexError:
                pass
        Len = len(self.info.get('Horse'))
        for i in range(1, 13):
            try:
                if any(num in self.info['Horse'][Len - i] for num in prefix):
                    self.info['Horse'].pop(Len - i)
            except IndexError:
                pass
    return self.info
def next(self, chunk_size=None):
    return StringIO.next(self)
def mutation_search(request):
    valid_fields = ['Start-Pos', 'End-Pos', 'Gene-Name', 'Mutation',
                    'Mutation-Descriptions', 'Effect-Type', 'HIVGene',
                    'Interaction-Type', 'HumanGene', 'Articles']
    show_fields = ['Start-Pos', 'End-Pos', 'Gene-Name', 'Mutation',
                   'Mutation-Descriptions', 'Articles']
    if request.method == 'POST':
        form = MutationSearch(request.POST)
        if form.is_valid():
            handle = StringIO(form.cleaned_data['lines'])
            headers = handle.next().strip().split(',')
            req = set(['Entrez', 'Start', 'Stop'])
            extra_headers = list(set(headers) - req)
            good_lines = []
            for row in DictReader(handle, fieldnames=headers):
                valid_muts = Mutation.objects.filter(
                    Position__gte=int(row['Start']))
                valid_muts = valid_muts.filter(
                    Position__lte=int(row['Stop']))
                if form.cleaned_data['allow']:
                    # Use Q-object OR (`|`), not the Python `or` keyword,
                    # which would evaluate only the first Q object.
                    valid_muts = valid_muts.filter(
                        Q(Gene__Entrez=int(row['Entrez'])) |
                        Q(Gene__isnull=True))
                else:
                    valid_muts = valid_muts.filter(
                        Gene__Entrez=int(row['Entrez']))
                if valid_muts.exists():
                    try:
                        gene = Gene.objects.get(Entrez=int(row['Entrez']))
                    except MultipleObjectsReturned:
                        gene = Gene.objects.filter(
                            Entrez=int(row['Entrez']))[0]
                    good_lines.append({
                        'Gene': gene,
                        'labels': dict(map(lambda x: (x, row[x]),
                                           extra_headers)),
                        'Position': (row['Start'], row['Stop']),
                        'Mutations': valid_muts})
            order = map(lambda x: x.strip(),
                        form.cleaned_data['order'].split(','))
            dict_list, field_names = MakeDictList(good_lines, order=order,
                                                  show_fields=show_fields)
            context = {
                'form': form,
                'good_lines': dict_list,
                'extra_headers': extra_headers,
                'field_names': field_names,
                'sortable_fields': field_names
            }
            if form.cleaned_data['csv_format']:
                resp = HttpResponse()
                resp['Content-Disposition'] = \
                    'attachment; filename=mutation_results.csv'
                writer = DictWriter(resp, field_names, for_csv=True)
                for line in dict_list:
                    writer.writerow(line)
                return resp
            else:
                return render_to_response(
                    'Interaction/mutation_search.html', context,
                    context_instance=RequestContext(request))
    else:
        form = MutationSearch(initial={'order': 'Gene-Name, Mutation'})
    context = {
        'form': form,
        'sortable_fields': valid_fields
    }
    return render_to_response('Interaction/mutation_search.html', context,
                              context_instance=RequestContext(request))