def export_as_csv(modeladmin, request, queryset,
                  fields=None, exclude=None, extra=None, header=True):
    """
    Generic csv export admin action.
    Based on http://djangosnippets.org/snippets/2712/

    fields, exclude, extra and header may be overridden via keyword
    arguments (in snippet 2712 they are closure variables supplied by a
    small factory function).
    """
    model = modeladmin.model
    opts = modeladmin.model._meta
    field_names = [field.name for field in opts.fields]
    if exclude:
        field_names = [f for f in field_names if f not in exclude]
    elif fields:
        # copy the list, not just assign the list
        field_names = list(fields)
    labels = prep_label(model, field_names)
    if extra:
        field_names += extra
        labels += prep_extra_label(model, extra)

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename=%s.csv' % (
        unicode(opts).replace('.', '_'))

    writer = UnicodeWriter(response)
    if header:
        writer.writerow(labels)
    for obj in queryset:
        writer.writerow([prep_field(obj, field) for field in field_names])
    return response
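# A minimal sketch of wiring the action above into the Django admin, assuming
# the keyword defaults given in export_as_csv. The Artist model, the
# ArtistAdmin class and the module paths are hypothetical; only the
# export_as_csv callable comes from the snippet above.
from django.contrib import admin

from myapp.models import Artist                 # hypothetical model
from myapp.admin_actions import export_as_csv   # wherever the action lives

export_as_csv.short_description = "Export selected objects as CSV"


class ArtistAdmin(admin.ModelAdmin):
    # the action appears in the changelist's "Action" dropdown
    actions = [export_as_csv]


admin.site.register(Artist, ArtistAdmin)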
def parse_statistics(self):
    time_spent = time.time() - self.start_time
    result_string = ('Done!\nSuccessfully parsed {} items.\n'
                     'Failed to parse {} items.\nSkipped {} items.\n'
                     'Time spent: {}'.format(self.parsed_successfully,
                                             self.parsed_unsuccessfully,
                                             self.skipped, time_spent))
    print result_string
    if self.savestats:
        output_file_name = 'parse_statistics_' + timezone.now().strftime(
            "%Y%m%d-%H%M") + '.txt'
        failed_csv_output_file = 'failed_rows_' + timezone.now().strftime(
            "%Y%m%d-%H%M") + '.csv'
        result_string += '\nFailed rows saved to {}'.format(
            failed_csv_output_file)
        print 'Saving extended statistics to file: {}'.format(output_file_name)
        with open(output_file_name, 'w') as f:
            f.write(result_string)
        with open(failed_csv_output_file, 'wb') as csv_output:
            writer = UnicodeWriter(csv_output, quotechar='"', delimiter=';')
            csv_output.write('Next rows failed to be parsed\n')
            for r in self.failed_rows:
                try:
                    # "for x in r" would not update the row in place
                    for i in xrange(len(r)):
                        r[i] = r[i].decode('UTF-8')
                    writer.writerow(r)
                except BaseException as e:
                    print e
                    print r
def artists(request):
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="artists.csv"'
    writer = UnicodeWriter(response)
    fieldNames = TArtist.field_names()
    writer.writerow(fieldNames)
    for artist in TArtist.objects.all()[:1000]:
        values = []
        for fieldName in fieldNames:
            values.append(artist.field_value(fieldName))
        writer.writerow(values)
    return response
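# A view like the one above is normally exposed through a URL pattern. A
# minimal sketch using the old-style django.conf.urls.url (consistent with the
# Python 2 era of these snippets); the module path myapp.views is hypothetical,
# only the view name `artists` comes from the snippet above.
from django.conf.urls import url

from myapp.views import artists

urlpatterns = [
    url(r'^artists\.csv$', artists, name='artists_csv'),
]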
def csvwrite(filename, verbose=None, codepage="cp1252"):
    """ Write the file in the desired format """
    records = 0
    table = dbf.Table(filename, codepage=codepage)
    table.open()
    with open("{f}.csv".format(f=filename), "w") as csvfile:
        # Include the first line as a comma-separated header, no quoting
        csvfile.write(",".join(table.field_names) + "\n")
        csvwriter = UnicodeWriter(csvfile, quoting=QUOTE_ALL)
        for records, row in enumerate(table, start=1):
            csvwriter.writerow([field.strip() for field in row])
            if verbose:
                # print a dot every 1000 records
                # (print() as a function needs `from __future__ import
                # print_function` under Python 2)
                if records % 1000 == 0:
                    print('.', end="")
    return records
def process_folder(folder):
    """
    Writes annotations from all FoLiA .xml-files from the given folder to a .csv-file.
    """
    # Create an output .csv-file
    with open(os.path.join(folder, 'annotations.csv'), 'wb') as csv_file:
        # the UTF-8 BOM, to hint Excel that we are using that encoding
        csv_file.write(u'\uFEFF'.encode('utf-8'))
        csv_writer = UnicodeWriter(csv_file, delimiter=';')
        csv_writer.writerow(['tekstnummer', 'zin nr', 'geannoteerde passage',
                             'correctie', 'eenheid', 'probleem', 'woordsoort',
                             'originele zin', 'gecorrigeerde zin'])

        # Loop over all .xml-files in the given folder
        for filename in glob.glob(os.path.join(folder, '*.xml')):
            print 'Processing', filename
            process_file(csv_writer, filename)
def write_tweets(self, file_name, tweets, columns):
    if file_name is None or tweets is None:
        return None
    unicode_writer = UnicodeWriter(open(file_name, 'w'))
    unicode_writer.writerow(columns)
    for tweet in tweets:
        _tmp = []
        for column in columns:
            if column in tweet:
                _tmp.append(tweet[column])
            else:
                _tmp.append('')
        unicode_writer.writerow(_tmp)
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    with args.output:
        csv_writer = UnicodeWriter(args.output)
        csv_writer.writerow(bacparser.models.get_model(args.year)._fields)
        for filename in args.filenames:
            logging.info('Converting %s' % (filename,))
            with open_compressed_file(filename) as f:
                unpickler = pickle.Unpickler(f)
                try:
                    while True:
                        o = unpickler.load()
                        csv_writer.writerow(o)
                except EOFError:
                    pass
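# The converter above relies on an open_compressed_file helper that is not
# shown. One plausible implementation, assumed here purely for illustration
# and keyed on the file extension, could look like this (the real helper in
# the project may differ):
import bz2
import gzip


def open_compressed_file(filename):
    """Open .gz/.bz2 files transparently, falling back to plain open()."""
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rb')
    if filename.endswith('.bz2'):
        return bz2.BZ2File(filename, 'rb')
    return open(filename, 'rb')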
def get_company_attr(market, urls, output_file):
    col_headers = [  # fields that already exist in the Wikipedia infobox
        'Industry',
        'Products',
        'Type',
        'Traded as',
        'Area served',
        'Parent',
        'Subsidiaries',
        'Key people',
        'Genre',
        'Services',
        'Owner(s)',
        'Employees',
        'Website',
        'Logo',
    ]
    extra_col_headers = [  # fields derived by post-processing
        'Symbol',
    ]
    csv_writer = UnicodeWriter(open(output_file, 'wb'), delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL,
                               encoding='utf-8')
    csv_writer.writerow(['Wiki link'] + col_headers + extra_col_headers)

    re_obj = re.compile(r"%s : (?P<symbol>\w*)" % market)
    for url in company_wiki_link_generator(urls):  # each company's wiki page
        print url
        response = get_web_contents(url)
        d = pq(response)
        p = d(".infobox")
        infobox = p.html()  # pull out the table holding the company data
        d = pq(infobox)
        rows = d.find('tr')

        data = [None] * (len(col_headers) + len(extra_col_headers))
        symbol = ''
        for row in rows:  # each row of the company-data table
            th_txt = pq(row).find('th').text()
            td = pq(row).find('td')
            if td.attr('class') == 'logo':
                td_txt = td.find('img').attr('src')
                data[col_headers.index('Logo')] = td_txt
            if th_txt in col_headers:
                td_txt = td.text()
                data[col_headers.index(th_txt)] = td_txt
            # take the stock symbol from the wiki 'Type' or 'Traded as' field
            if th_txt == 'Type':
                try:
                    symbol = re_obj.search(td_txt).group("symbol")
                except Exception:
                    pass
            if (not symbol) and th_txt == 'Traded as':
                try:
                    symbol = re_obj.search(td_txt).group("symbol")
                except Exception:
                    pass
        data[len(col_headers) + extra_col_headers.index("Symbol")] = symbol
        csv_writer.writerow([url] + data)
class DocumentIngester(object):

    def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
        """Return a new ingester for the corpus.

        parser may be sentence_parse or ngram_parser(n)

        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """
        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities

        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0

        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()

        self.sequencer = PhraseSequencer(corpus)

    def _record_document(self, text, phrases, metadata):
        doc_id = self.next_id
        self.next_id += 1

        # serialize metadata as "key"=>"value" pairs (hstore-style)
        formatted_metadata = ",".join(
            ['"%s"=>"%s"' % (key, value.replace('\\', '\\\\').replace('"', '\\"'))
             for (key, value) in metadata.items()])

        self.document_writer.writerow(
            [str(self.corpus.id), str(doc_id), text, formatted_metadata])

        for (phrase_id, indexes) in phrases:
            formatted_indexes = '"{%s}"' % ", ".join(
                ['""(%s, %s)""' % (start, end) for (start, end) in indexes])
            self.occurrence_file.write(
                "%s,%s,%s,%s\n" % (self.corpus.id, doc_id, phrase_id, formatted_indexes))

        return doc_id

    def _upload_new_documents(self):
        """Upload document text and phrase occurrences

        Return list of new document_ids
        """
        self.document_file.flush()
        self.document_file.seek(0)
        self.corpus.upload_csv(self.document_file, 'documents')
        self.document_file.close()
        self.document_file = tempfile.TemporaryFile()
        self.document_writer = csv.writer(self.document_file)

        self.occurrence_file.flush()
        self.occurrence_file.seek(0)
        self.corpus.upload_csv(self.occurrence_file, 'phrase_occurrences')
        self.occurrence_file.close()
        self.occurrence_file = tempfile.TemporaryFile()

    def ingest(self, docs):
        """Ingest set of new documents"""
        new_doc_ids = list()

        print "parsing %s documents..." % len(docs)

        for doc in docs:
            if isinstance(doc, basestring):
                text = doc
                metadata = {}
            else:
                text = doc['text']
                metadata = doc['metadata']

            phrases = self.parser.__call__(text, self.sequencer)
            id = self._record_document(text, phrases, metadata)
            new_doc_ids.append(id)

        print "uploading documents..."

        self.sequencer.upload_new_phrases()
        self._upload_new_documents()

        if self.should_compute_similarities:
            print "computing similarities..."
            self.compute_similarities(new_doc_ids)

    @staticmethod
    def _pairs_for_comparison(all_ids, new_ids):
        allowed_ids = set(all_ids)
        all_ids = list(all_ids)
        all_ids.sort()
        new_ids = list(new_ids)
        new_ids.sort(reverse=True)

        for x in all_ids:
            for y in new_ids:
                if x >= y:
                    break
                if y in allowed_ids:
                    yield (x, y)

    def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
        docs = self.corpus.all_docs()

        # new_doc_ids is used to keep from recomputing already known similarities.
        # None is a special signal to compute on all doc pairs.
        if new_doc_ids is None:
            new_doc_ids = docs.keys()

        with get_similarity_writer(self.corpus.id) as writer:
            i = 0
            for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
                similarity = jaccard(docs[x], docs[y])
                if similarity >= min_similarity:
                    writer.write(x, y, similarity)
                i += 1
                if i % 10000000 == 0:
                    writer.flush()
                    sys.stdout.write('.')
                    sys.stdout.flush()
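# Every snippet in this collection leans on a UnicodeWriter helper, which
# Python 2's csv module does not provide out of the box. For reference, a
# minimal sketch based on the well-known recipe from the Python 2 csv
# documentation; individual projects may carry slightly different variants
# (e.g. one accepting an `encoding` keyword, as in the Wikipedia scraper above).
import codecs
import csv
import cStringIO


class UnicodeWriter(object):
    """A CSV writer which writes rows of unicode strings to the stream ``f``,
    encoded in the given encoding."""

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # write to an in-memory buffer first, then re-encode into the target
        # encoding and push the result to the underlying stream
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue().decode("utf-8")
        self.stream.write(self.encoder.encode(data))
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)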