def parse_statistics(self):
    """Print a summary of the parse run and, when self.savestats is set,
    save the summary to a timestamped .txt file and dump the failed rows
    to a timestamped .csv file.

    Reads self.start_time, self.parsed_successfully,
    self.parsed_unsuccessfully, self.skipped, self.savestats and
    self.failed_rows (a list of lists of byte strings).
    """
    time_spent = time.time() - self.start_time
    result_string = ('Done!\nSuccessfully parsed {} items.\n'
                     'Failed to parse {} items.\nSkipped {} items.\n'
                     'Time spent:{}').format(
        self.parsed_successfully, self.parsed_unsuccessfully,
        self.skipped, time_spent)
    print(result_string)
    if not self.savestats:
        return
    # BUG FIX: was "%Y%m%d-%H%m" -- %m is the *month*; %M is minutes.
    timestamp = timezone.now().strftime("%Y%m%d-%H%M")
    output_file_name = 'parse_statistics_' + timestamp + '.txt'
    failed_csv_output_file = 'failed_rows_' + timestamp + '.csv'
    result_string += '\n Failed rows saved to {}'.format(
        failed_csv_output_file)
    print('Saving extended statistics to file: {}'.format(output_file_name))
    with open(output_file_name, 'w') as stats_file:
        stats_file.write(result_string)
    with open(failed_csv_output_file, 'wb') as csv_output:
        writer = UnicodeWriter(csv_output, quotechar='"', delimiter=';')
        # BUG FIX: the header was written to output_file_name (a str,
        # which has no .write); it must go to the open csv file.
        csv_output.write('Next rows failed to be parsed\n')
        for r in self.failed_rows:
            try:
                # Mutate items in place, which is why the cleaner
                # "for x in r" form does not work here.
                for i in xrange(len(r)):
                    r[i] = r[i].decode('UTF-8')
                writer.writerow(r)
            except Exception as e:
                # Was BaseException, which also swallowed
                # KeyboardInterrupt/SystemExit; best-effort logging kept.
                print(e)
                print(r)
def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
    """Create an ingester bound to *corpus*.

    parser may be sentence_parse or ngram_parser(n).  The client must
    ensure that no other ingester runs concurrently on the same corpus.
    """
    self.corpus = corpus
    self.parser = parser
    self.should_compute_similarities = compute_similarities
    # Continue document numbering after the highest existing id,
    # or start at 0 for an empty corpus.
    last_id = corpus.max_doc_id()
    if last_id is None:
        self.next_id = 0
    else:
        self.next_id = last_id + 1
    # Staging areas for documents and phrase occurrences.
    self.document_file = tempfile.TemporaryFile()
    self.document_writer = UnicodeWriter(self.document_file)
    self.occurrence_file = tempfile.TemporaryFile()
    self.sequencer = PhraseSequencer(corpus)
def write_tweets(self, file_name, tweets, columns):
    """Write *tweets* (an iterable of dicts) to *file_name* as CSV.

    The first row is *columns*; each subsequent row contains the tweet's
    value for each column, or '' when the key is absent.  Returns None
    immediately when file_name or tweets is None.
    """
    if file_name is None or tweets is None:
        return None
    # BUG FIX: the original opened the file without ever closing it
    # (handle leak, possibly unflushed data); use a with-block instead.
    with open(file_name, 'w') as output:
        unicode_writer = UnicodeWriter(output)
        unicode_writer.writerow(columns)
        for tweet in tweets:
            unicode_writer.writerow(
                [tweet[column] if column in tweet else ''
                 for column in columns])
def main():
    """Entry point: convert pickled model records to CSV on args.output."""
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    with args.output:
        writer = UnicodeWriter(args.output)
        # Header row comes from the model's namedtuple fields.
        writer.writerow(bacparser.models.get_model(args.year)._fields)
        for filename in args.filenames:
            logging.info('Converting %s' % (filename,))
            with open_compressed_file(filename) as f:
                unpickler = pickle.Unpickler(f)
                # Drain every pickled object until the stream ends.
                try:
                    while True:
                        writer.writerow(unpickler.load())
                except EOFError:
                    pass
def process_folder(folder): """ Writes annotations from all FoLiA .xml-files from the given folder to a .csv-file. """ # Create an output .csv-file with open(os.path.join(folder, 'annotations.csv'), 'wb') as csv_file: csv_file.write(u'\uFEFF'.encode('utf-8')) # the UTF-8 BOM to hint Excel we are using that... csv_writer = UnicodeWriter(csv_file, delimiter=';') csv_writer.writerow(['tekstnummer', 'zin nr', 'geannoteerde passage', 'correctie', 'eenheid', 'probleem', 'woordsoort', 'originele zin', 'gecorrigeerde zin']) # Loop over all .xml-files in the given folder for filename in glob.glob(os.path.join(folder, '*.xml')): print 'Processing ', filename process_file(csv_writer, filename)
def get_company_attr(market, urls, output_file): col_headers = [ #維基百科上原本就有的欄位 'Industry', 'Products', 'Type', 'Traded as', 'Area served', 'Parent', 'Subsidiaries', 'Key people', 'Genre', 'Services', 'Owner(s)', 'Employees', 'Website', 'Logo', ] extra_col_headers = [ #經過處理後產生的欄位 'Symbol', ] csv_writer = UnicodeWriter(open(output_file, 'wb'), delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, encoding='utf-8') csv_writer.writerow(['Wiki link'] + col_headers + extra_col_headers) re_obj = re.compile("%s : (?P<symbol>\w*)" % market) for url in company_wiki_link_generator(urls): #每個公司的wiki頁面 print url response = get_web_contents(url) d = pq(response) p = d(".infobox") infobox = p.html() #取出公司資料的table d = pq(infobox) rows = d.find('tr') data = [None] * (len(col_headers) + len(extra_col_headers)) symbol = '' for row in rows: #公司資料table中的每一列 th_txt = pq(row).find('th').text() td = pq(row).find('td') if td.attr('class') == 'logo': td_txt = td.find('img').attr('src') data[col_headers.index('Logo')] = td_txt if th_txt in col_headers: td_txt = td.text() data[col_headers.index(th_txt)] = td_txt #從Wiki的Type或Trade as欄位取的股票的Symbol if th_txt == 'Type': try: symbol = re_obj.search(td_txt).group("symbol") except Exception: pass if (not symbol) and th_txt == 'Traded as': try: symbol = re_obj.search(td_txt).group("symbol") except Exception: pass data[len(col_headers) + extra_col_headers.index("Symbol")] = symbol csv_writer.writerow([url] + data)