def main():
    args = parse_args()
    input_file = utils.open_compressed_file(args.input_file)
    conn = sqlite3.connect(str(args.sqlite_file))
    create_tables(conn)
    conn.execute('PRAGMA synchronous = OFF')
    conn.execute('PRAGMA journal_mode = MEMORY')
    print('Inserting data...')
    with input_file, conn:
        reader = csv.reader(input_file)
        assert next(reader) == ['timestamp', 'from', 'to']
        records = (parse_record(r) for r in reader)
        db_records = (
            (r.timestamp, args.project, r.from_, r.to)
            for r in records
        )
        conn.executemany(
            'INSERT INTO moves VALUES (?, ?, ?, ?)',
            db_records,
        )
    if args.create_indexes:
        print('Creating indexes...')
        create_indexes()
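# Hypothetical sketch (not part of the original script): parse_record() and
# create_tables() are referenced above but not shown. Judging from the CSV
# header check and the attributes used (r.timestamp, r.from_, r.to), they
# plausibly look like the following; the record name and column types are
# assumptions.
import collections

MoveRecord = collections.namedtuple('MoveRecord', ['timestamp', 'from_', 'to'])


def parse_record(row):
    # 'from' is a Python keyword, hence the trailing underscore in the field name.
    timestamp, from_, to = row
    return MoveRecord(timestamp, from_, to)


def create_tables(conn):
    # Column order matches the positional INSERT above:
    # timestamp, project, "from", "to".
    conn.execute('''
        CREATE TABLE IF NOT EXISTS moves (
            timestamp TEXT,
            project TEXT,
            "from" TEXT,
            "to" TEXT
        )
    ''')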
def main():
    args = parse_args()
    db_conn = pymysql.connect(**args.mysql_url)
    insert_tpl = '''
        INSERT INTO `mag_papers` (
            `paper_id`,
            `original_paper_title`,
            `normalized_paper_title`,
            `paper_publish_year`,
            `paper_publish_date`,
            `paper_doi`,
            `original_venue_name`,
            `normalized_venue_name`,
            `journal_id_mapped_to_venue_name`,
            `conference_series_id_mapped_to_venue_name`,
            `paper_rank`
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    if args.create_tables:
        print('Creating tables and indexes')
        create_tables_and_indexes(db_conn.cursor())
        db_conn.commit()
    print('Reading', args.input_csv, '...')
    input_file = utils.open_compressed_file(args.input_csv)
    cursor = db_conn.cursor()
    with input_file, cursor:
        csvreader = csv.reader(
            input_file,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
        )
        records = (parse_papers_record(r) for r in csvreader)
        records_truncated = (
            (
                r.paper_id[:50],
                r.original_paper_title[:255],
                r.normalized_paper_title[:255],
                r.paper_publish_year,
                r.paper_publish_date,
                r.paper_doi[:255],
                r.original_venue_name[:255],
                r.normalized_venue_name[:255],
                r.journal_id_mapped_to_venue_name[:255],
                r.conference_series_id_mapped_to_venue_name[:255],
                r.paper_rank,
            )
            for r in records
        )
        records_with_progress = frogress.bar(
            records_truncated,
            steps=args.expected_records,
        )
        cursor.executemany(insert_tpl, records_with_progress)
        db_conn.commit()
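# Hypothetical sketch (the real create_tables_and_indexes() is not shown
# above): a plausible DDL for `mag_papers`. The column list comes from the
# INSERT template and the VARCHAR lengths mirror the [:50]/[:255] truncations;
# the types for year/date/rank and the key choice are assumptions.
MAG_PAPERS_DDL = '''
    CREATE TABLE IF NOT EXISTS `mag_papers` (
        `paper_id` VARCHAR(50) NOT NULL,
        `original_paper_title` VARCHAR(255),
        `normalized_paper_title` VARCHAR(255),
        `paper_publish_year` INT,
        `paper_publish_date` DATE,
        `paper_doi` VARCHAR(255),
        `original_venue_name` VARCHAR(255),
        `normalized_venue_name` VARCHAR(255),
        `journal_id_mapped_to_venue_name` VARCHAR(255),
        `conference_series_id_mapped_to_venue_name` VARCHAR(255),
        `paper_rank` INT,
        PRIMARY KEY (`paper_id`)
    )
'''


def create_tables_and_indexes(cursor):
    cursor.execute(MAG_PAPERS_DDL)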
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    parser_cls = bacparser.parsers.get_parser_cls(args.year)
    if args.format == 'python':
        write = functools.partial(write_python, args.output)
    else:  # 'pickle'
        write = functools.partial(write_pickle, args.output)
    with args.output:
        for filename in args.filenames:
            with open_compressed_file(filename) as f:
                logging.info('Extracting from %s', filename)
                for i in parser_cls(f):
                    write(i)
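# Hypothetical sketch (these helpers are not shown above): given the
# functools.partial(..., args.output) calls, write_python() and write_pickle()
# presumably take the output file first and the parsed record second. The
# bodies below are assumptions about what 'python' vs 'pickle' output means
# here: one repr() per line versus an appended pickle stream.
import pickle


def write_python(output, record):
    # One Python literal per line; readable and diff-friendly.
    output.write(repr(record) + '\n')


def write_pickle(output, record):
    # Consecutive pickles in one (binary) file; readable back with repeated
    # pickle.Unpickler(...).load() calls until EOFError, as the CSV converter
    # further below does.
    pickle.dump(record, output, protocol=pickle.HIGHEST_PROTOCOL)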
def main(): logging.config.fileConfig("logging.ini", disable_existing_loggers=False) args = parse_args() parser_cls = bacparser.parsers.get_parser_cls(args.year) if args.format == "python": write = functools.partial(write_python, args.output) else: # 'pickle' write = functools.partial(write_pickle, args.output) with args.output: for filename in args.filenames: with open_compressed_file(filename) as f: logging.info("Extracting from %s" % (filename,)) for i in parser_cls(f): write(i)
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    parser = bacparser.parsers.get_parser(args.year)
    if args.format == 'python':
        write = functools.partial(write_python, args.output)
    else:  # 'pickle'
        write = functools.partial(write_pickle, args.output)
    with args.output:
        for filename in args.filenames:
            with open_compressed_file(filename) as f:
                logging.info('Extracting from %s', filename)
                main_table = get_main_table_from_file(f, args.year)
                for i in parser.get_elev(main_table):
                    write(i)
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    with args.output:
        csv_writer = UnicodeWriter(args.output)
        csv_writer.writerow(bacparser.models.get_model(args.year)._fields)
        for filename in args.filenames:
            logging.info('Converting %s', filename)
            with open_compressed_file(filename) as f:
                unpickler = pickle.Unpickler(f)
                try:
                    while True:
                        o = unpickler.load()
                        csv_writer.writerow(o)
                except EOFError:
                    pass
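# Hypothetical sketch: UnicodeWriter is not defined above. On Python 2 it is
# usually the csv-module documentation recipe that encodes each cell to UTF-8
# before writing; on Python 3 a thin wrapper around csv.writer is enough, for
# example:
import csv


class UnicodeWriter:
    def __init__(self, f, dialect=csv.excel, **kwds):
        self.writer = csv.writer(f, dialect=dialect, **kwds)

    def writerow(self, row):
        # Stringify every cell so namedtuple fields of any type can be written.
        self.writer.writerow(['' if cell is None else str(cell) for cell in row])

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)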
def main():
    args = parse_args()
    db_conn = pymysql.connect(**args.mysql_url)
    insert_tpl = """
        INSERT INTO `identifiershistory` (
            `project`,
            `page_id`,
            `page_title`,
            `identifier_type`,
            `identifier_id`,
            `start_date`,
            `end_date`
        ) VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    if args.create_tables:
        print("Creating tables and indexes")
        create_tables_and_indexes(db_conn.cursor())
        db_conn.commit()
    for file_path in args.input_files:
        print("Reading", file_path, "...")
        input_file = utils.open_compressed_file(file_path)
        cursor = db_conn.cursor()
        with input_file, cursor:
            csvreader = csv.reader(input_file)
            records = (
                utils.parse_identifier_history_record(r) for r in csvreader
            )
            records_truncated = (
                (
                    r.project,
                    r.page_id,
                    r.page_title[:255],
                    r.identifier_type[:20],
                    r.identifier_id[:255],
                    r.start_date,
                    r.end_date,
                )
                for r in records
            )
            cursor.executemany(insert_tpl, records_truncated)
            db_conn.commit()
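# Hypothetical sketch (the real create_tables_and_indexes() is not shown): a
# plausible DDL for `identifiershistory`, with VARCHAR lengths matching the
# [:255]/[:20] truncations above; the column types and index choice are
# assumptions.
IDENTIFIERS_HISTORY_DDL = """
    CREATE TABLE IF NOT EXISTS `identifiershistory` (
        `project` VARCHAR(50) NOT NULL,
        `page_id` INT UNSIGNED NOT NULL,
        `page_title` VARCHAR(255),
        `identifier_type` VARCHAR(20),
        `identifier_id` VARCHAR(255),
        `start_date` DATETIME,
        `end_date` DATETIME,
        KEY `identifier_idx` (`identifier_type`, `identifier_id`)
    )
"""


def create_tables_and_indexes(cursor):
    cursor.execute(IDENTIFIERS_HISTORY_DDL)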
def main():
    args = parse_args()
    move_actions = {'move', 'move_redir'}
    input_file = utils.open_compressed_file(args.input_file)
    # output_file = gzip.open(str(args.output_file), 'wt', encoding='utf-8')
    output_file = sys.stdout
    with input_file, output_file:
        writer = csv.writer(output_file)
        writer.writerow(('timestamp', 'from', 'to'))
        logitems = iter_elems(
            input_file,
            tag='{http://www.mediawiki.org/xml/export-0.10/}logitem',
        )
        for logitem in logitems:
            action = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}action')
            if action.text not in move_actions:
                continue
            params = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}params')
            logtitle = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}logtitle')
            if params is None or logtitle is None:
                continue
            redirect = get_redirect(params.text)
            timestamp = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}timestamp')
            writer.writerow((
                timestamp.text,
                logtitle.text,
                redirect,
            ))
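# Hypothetical sketch: iter_elems() is not shown above. A dump this large is
# normally walked with xml.etree.ElementTree.iterparse, clearing each element
# after it has been yielded so memory use stays low; something like:
import xml.etree.ElementTree as ET


def iter_elems(source, tag):
    for _event, elem in ET.iterparse(source, events=('end',)):
        if elem.tag == tag:
            yield elem
            elem.clear()  # drop children so the in-memory tree does not grow unbounded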
def main():
    args = parse_args()
    print(args)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    db_url = args.db_url
    db_vars = dict(
        host=db_url.hostname,
        port=db_url.port or 3306,
        user=db_url.username,
        password=db_url.password or '',
        database=db_url.path.rpartition('/')[-1],
        charset='utf8',
    )
    print(db_vars)
    db_conn = pymysql.connect(**db_vars)

    counts_finder = pagecountssearch.Finder(args.counts_dataset_dir)
    views_counter = ViewsCounter(
        counts_finder,
        start_period=args.counts_period_start,
        end_period=args.counts_period_end,
        granularity=datetime.timedelta(hours=1),
    )

    for input_file_path in args.input_files:
        input_file = utils.open_compressed_file(input_file_path)
        basename = input_file_path.name
        output_file_path = args.output_dir / basename
        output_file = output_file_path.open('wt', encoding='utf-8')
        with input_file, output_file:
            raw_records = csv.reader(input_file)
            input_records = (parse_record(r) for r in raw_records)
            output_records = (
                OutputRecord(
                    *r,
                    counts_for_page(
                        db_conn,
                        views_counter,
                        r.project,
                        r.page_id,
                        r.page_title,
                        r.start_date,
                        r.end_date,
                    ),
                )
                for r in input_records
            )
            writer = csv.writer(output_file)
            for output_record in output_records:
                writer.writerow(output_record)
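# Hypothetical sketch (not in the original file): the record types used above.
# parse_record() must yield objects with project, page_id, page_title,
# start_date and end_date attributes, and OutputRecord(*r, counts) appends one
# extra field; the exact field list and the 'views' name are assumptions.
import collections

InputRecord = collections.namedtuple(
    'InputRecord',
    ['project', 'page_id', 'page_title', 'identifier_type', 'identifier_id',
     'start_date', 'end_date'],
)
OutputRecord = collections.namedtuple(
    'OutputRecord', InputRecord._fields + ('views',))


def parse_record(row):
    return InputRecord(*row)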