from eFiction import efiction
from shared_python import Args
from shared_python.Chapters import Chapters
from shared_python.FinalTables import FinalTables
from shared_python.Sql import Sql
from shared_python.Tags import Tags


def _clean_email(author):
    email = author['email']
    if email is None or email == '':
        email = u'{0}{1}[email protected]'.format(author['name'], args.archive_name) \
            .replace(' ', '').replace("'", "")
    if email.startswith('mailto:'):
        email = email.replace('mailto:', '')
    return email


if __name__ == "__main__":
    args = Args.args_for_05()
    sql = Sql(args)
    tags = Tags(args, sql.db)
    final = FinalTables(args, sql.db)
    chaps = Chapters(args, sql.db)
    filter = ''
    coauthors = {}

    print "Creating destination tables in {0}".format(args.output_database)

    if args.archive_type == 'EF':
        table_names = efiction.table_names()
        has_coauthor_table = raw_input(
            "\nDoes this archive have a coauthors table? Y/N\n")
        has_coauthors = True if str.lower(has_coauthor_table) == 'y' else False
        if has_coauthors:
from eFiction import efiction
from shared_python.Args import Args
from shared_python.FinalTables import FinalTables
from shared_python.Sql import Sql
from shared_python.Tags import Tags


def valid_tags(key, tag_type_list):
    return [d[key].strip() for d in tag_type_list
            if key in d and d[key] is not None and d[key] != '']


if __name__ == "__main__":
    args_obj = Args()
    args = args_obj.args_for_06()
    log = args_obj.logger_with_filename()
    sql = Sql(args, log)
    tags = Tags(args, sql.db, log)
    final = FinalTables(args, sql.db, log)

    if args.archive_type == 'EF':
        table_names = efiction.table_names()
    else:
        table_names = {
            'authors': 'authors',
            'stories': 'stories',
            'chapters': 'chapters'
        }

    log.info("Getting all tags per story...")
    tags_by_story_id = tags.tags_by_story_id()
    for (story_id, tags) in tags_by_story_id.items():
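# Illustrative only, not part of the original script (hypothetical data and key):
# valid_tags pulls one key out of a list of tag dicts, skips rows where that key
# is missing, None, or empty, and strips whitespace from the survivors.
#
#   rows = [{'ao3_tag': ' Angst '}, {'ao3_tag': None}, {'ao3_tag': ''}, {'other': 'x'}]
#   valid_tags('ao3_tag', rows)   # -> ['Angst']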
def _create_mysql(args, FILES, log):
    db = connect(args.db_host, args.db_user, args.db_password, "")
    cursor = db.cursor()
    DATABASE_NAME = args.temp_db_database

    # Use the database and empty all the tables
    cursor.execute(u"drop database if exists {0};".format(DATABASE_NAME))
    cursor.execute(u"create database {0};".format(DATABASE_NAME))
    cursor.execute(u"use {0}".format(DATABASE_NAME))
    sql = Sql(args)
    sql.run_script_from_file('shared_python/create-open-doors-tables.sql',
                             DATABASE_NAME)
    db.commit()

    authors = [(FILES[i].get('Author', '').strip(),
                FILES[i].get('Email', FILES[i].get('EmailAuthor', '')).lower().strip())
               for i in FILES]
    auth = u"INSERT INTO authors (name, email) VALUES(%s, %s);"
    cursor.executemany(auth, set(authors))
    db.commit()

    # Authors
    auth = u"SELECT * FROM authors;"
    cursor.execute(auth)
    db_authors = cursor.fetchall()

    # Stories and bookmarks
    stories = [(
        i,
        FILES[i].get('Title', '').replace("'", "\\'"),
        FILES[i].get('Summary', '').replace("'", "\\'"),
        _extract_tags(args, FILES[i]),
        _extract_characters(args, FILES[i]),
        datetime.datetime.strptime(
            FILES[i].get(
                'PrintTime',
                FILES[i].get(
                    'DatePrint',
                    FILES[i].get(
                        'Date',
                        str(datetime.datetime.now().strftime('%m/%d/%y'))))),
            '%m/%d/%y').strftime('%Y-%m-%d'),
        FILES[i].get('Location', '').replace("'", "\\'"),
        FILES[i].get('LocationURL', FILES[i].get('StoryURL', '')).replace("'", "\\'"),
        FILES[i].get('Notes', '').replace("'", "\\'"),
        _extract_relationships(args, FILES[i]),
        FILES[i].get('Rating', ''),
        FILES[i].get('Warnings', '').replace("'", "\\'"),
        FILES[i].get('Author', '').strip(),
        FILES[i].get('Email', FILES[i].get('EmailAuthor', '')).lower().strip(),
        FILES[i].get('FileType', args.chapters_file_extensions) if not _is_external(FILES[i]) else 'bookmark',
        _extract_fandoms(args, FILES[i]),
    ) for i in FILES]

    cur = 0
    total = len(FILES)
    for (original_id, title, summary, tags, characters, date, location, url,
         notes, pairings, rating, warnings, author, email, filetype,
         fandoms) in set(stories):
        cur = Common.print_progress(cur, total)
        try:
            # For AA archives with external links:
            if filetype != 'bookmark':
                if location == '':
                    filename = url
                else:
                    filename = location + '.' + filetype
                table_name = 'stories'
            else:
                filename = url
                table_name = 'bookmarks'

            # Clean up fandoms and add default fandom if it exists
            final_fandoms = fandoms.replace("'", r"\'")
            if args.default_fandom is not None:
                if final_fandoms == '' or final_fandoms == args.default_fandom:
                    final_fandoms = args.default_fandom
                else:
                    final_fandoms = args.default_fandom + ', ' + final_fandoms

            result = [
                element for element in db_authors
                if element[1] == author and element[2] == email
            ]
            authorid = result[0][0]
            stor = u"""
        INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes,
                         relationships, rating, warnings, author_id)
        VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""" \
                .format(table_name, original_id,
                        final_fandoms.replace(r"\\", "\\"),
                        title.replace(r"\\", "\\"),
                        summary, tags, characters, date, filename, notes,
                        pairings, rating, warnings, authorid)
            cursor.execute(stor)
        except:
            log.error(
                "table name: {0}\noriginal id: {1}\nfinal fandoms: '{2}'\ntitle: '{3}'\nsummary: '{4}'\ntags: '{5}'"
                "\ncharacters: '{6}'\ndate: '{7}'\nfilename: '{8}'\nnotes: '{9}'\npairings: '{10}'\nrating: '{11}'"
                "\nwarnings: '{12}'\nauthor id: '{13}'"
                .format(table_name, original_id, final_fandoms, title, summary,
                        tags, characters, date, filename, notes, pairings,
                        rating, warnings, authorid))
            raise
    db.commit()
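# Illustrative only, not part of the original module (hypothetical record value):
# the date fallback above reads 'PrintTime', then 'DatePrint', then 'Date', and
# finally falls back to today's date, always reparsing from '%m/%d/%y' into ISO
# '%Y-%m-%d'. For example:
#
#   import datetime
#   datetime.datetime.strptime('03/15/99', '%m/%d/%y').strftime('%Y-%m-%d')  # -> '1999-03-15'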
            for s in row:
                r.append('' if s is None else html_parser.unescape(unicode(s)).encode('utf-8'))
            myFile.writerows([r])
    fp.close()


if __name__ == "__main__":
    """
    This step exports the Tag Wrangling and Authors with stories CSV files which
    you then have to import into Google Spreadsheet and share with the rest of
    the Open Doors committee.
    """
    args_obj = Args()
    args = args_obj.args_for_03()
    log = args_obj.logger_with_filename()
    sql = Sql(args, log)
    tags = Tags(args, sql.db, log)

    log.info('Exporting tags from {0} to {1}'.format(args.temp_db_database,
                                                     args.output_folder))
    cols = tags.tag_export_map
    results = tags.distinct_tags()
    write_csv(
        '{0}/{1} - tags.csv'.format(args.output_folder, args.archive_name), [
            cols['original_tagid'], cols['original_tag'], cols['original_table'],
            cols['original_parent'], cols['ao3_tag_fandom'], cols['ao3_tag'],
            cols['ao3_tag_type'], cols['ao3_tag_category'],
            cols['original_description'], "TW Notes"
        ])

    log.debug('Exporting authors with stories from {0} to {1}'.format(
import re

from shared_python import Args
from shared_python.Sql import Sql
from shared_python.Tags import Tags

if __name__ == "__main__":
    args = Args.args_for_02()
    sql = Sql(args)
    tags = Tags(args, sql.db)

    print('--- Processing tags from stories table in {0}'.format(
        args.db_database))
    tags.create_tags_table()

    # eg: python 02-Extract-Tags.py -dh localhost -du root -dt dsa -dd temp_python -a AA -f /Users/emma/OneDrive/DSA/ARCHIVE_DB.pl -o .
    tag_col_list = {}
    stories_id_name = ""
    stories_table_name = ""

    # AUTOMATED ARCHIVE
    if args.archive_type == 'AA':
        table_name = raw_input(
            'Story table name (default: "{0}_stories"): '.format(
                args.db_table_prefix))
        if table_name is None or table_name == '':
            table_name = '{0}_stories'.format(args.db_table_prefix)

        tag_columns = raw_input(
            'Column names containing tags \n (delimited by commas - default: "tags, warnings, characters, fandoms, relationships"): ')
        if tag_columns is None or tag_columns == '':
# encoding: utf-8
import csv
import sys

from shared_python import Args
from shared_python.Sql import Sql
from shared_python.Tags import Tags

if __name__ == "__main__":
    args = Args.args_for_04()
    sql = Sql(args)
    tags = Tags(args, sql.db)

    # Input CSV from TW spreadsheet
    # Rename tags in `tags` table, populate ao3_tag_table column
    # eg: python 04-Rename-Tags.py -dh localhost -du root -dt dsa -dd temp_python -a EF -i path/to/tw-spreadsheet.csv
    with open(args.tag_input_file, 'r') as csvfile:
        tw_tags = list(csv.DictReader(csvfile))

    tag_headers = tags.tag_export_map
    total = len(tw_tags)
    for cur, row in enumerate(tw_tags):
        sys.stdout.write('\r{0}/{1} tags to map'.format(cur + 1, total))
        sys.stdout.flush()

        prefix = 'fanfiction' if args.archive_type == 'EF' else ''
        tags.update_tag_row(row, prefix)
# encoding: utf-8
import csv
import os

from eFiction import efiction
from shared_python import Args
from shared_python.Chapters import Chapters
from shared_python.FinalTables import FinalTables
from shared_python.Sql import Sql
from shared_python.Tags import Tags

if __name__ == "__main__":
    args = Args.args_for_05()
    sql = Sql(args)

    filter = 'WHERE `id` in '
    story_exclusion_filter = ''
    # Filter out DNI stories - story_ids_to_remove must be a comma-separated list of DNI ids
    if os.path.exists(args.story_ids_to_remove):
        with open(args.story_ids_to_remove, "rt") as f:
            for line in f:
                story_exclusion_filter = filter + '(' + line + ')'

    command = "DELETE FROM `{0}`.`{1}_stories` {2}".format(
        args.output_database, args.db_table_prefix, story_exclusion_filter)
    print command
    result = sql.execute(command)
    print result
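# Illustrative only, not part of the original script: assuming the file that
# args.story_ids_to_remove points at is plain text with the DNI ids on one
# comma-separated line, a file containing
#
#   12, 57, 203
#
# would produce the exclusion filter "WHERE `id` in (12, 57, 203)" for the
# DELETE statement above.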
    html_parser = HTMLParser()
    with open(filename, 'w') as fp:
        myFile = csv.writer(fp)
        myFile.writerow(columns)
        for row in results:
            r = []
            for s in row:
                r.append('' if s is None else html_parser.unescape(unicode(s)).encode('utf-8'))
            myFile.writerows([r])
    fp.close()


if __name__ == "__main__":
    args = Args.args_for_03()
    sql = Sql(args)
    tags = Tags(args, sql.db)

    print('--- Exporting tags from {0}'.format(args.db_database))
    cols = tags.tag_export_map
    results = tags.distinct_tags()
    write_csv('{0} - tags.csv'.format(args.db_database), [
        cols['original_tagid'], cols['original_tag'], cols['original_table'],
        cols['original_parent'], cols['ao3_tag_fandom'], cols['ao3_tag'],
        cols['ao3_tag_type'], cols['ao3_tag_category'],
        cols['original_description'], "TW Notes"
    ])

    print('--- Exporting authors with stories from {0}'.format(
        args.db_database))
    if args.archive_type == 'AA':
        author_table = '{0}.{1}_authors'.format(args.db_database,
class TestEFiction(TestCase):
    args = testArgs()
    log = logger("test")
    sql = Sql(args, log)
    tags = Tags(args, sql.db, log)
    efiction = eFiction(args, sql, log, tags)
    efiction_db = "{0}_efiction".format(args.temp_db_database)

    @classmethod
    def setUpClass(cls):
        cls.efiction.load_database()
        cls.efiction.copy_tags_to_tags_table(None, "y")

    @classmethod
    def tearDownClass(cls):
        cls.sql.execute("DROP DATABASE IF EXISTS {0}".format(cls.efiction_db))
        cls.sql.execute("DROP DATABASE IF EXISTS {0}".format(
            cls.args.temp_db_database))

    def test_load_database(self):
        cursor = self.sql.cursor

        test_msg = "original efiction database name from the SQL file should not be created"
        cursor.execute(
            "SHOW DATABASES LIKE 'test_efiction_original_database_name_we_dont_want'")
        unwanted_database = cursor.fetchone()
        self.assertEqual(None, unwanted_database, test_msg)

        test_msg = "fanfiction_authorfields table should contain the same number of records as in the SQL file"
        cursor.execute(
            "SELECT COUNT(*) FROM {0}.fanfiction_authorfields".format(
                self.efiction_db))
        (authorfields, ) = cursor.fetchone()
        self.assertEqual(3L, authorfields, test_msg)

    def test_copy_tags_to_tags_table(self):
        cursor = self.sql.db.cursor(MySQLdb.cursors.DictCursor)
        cursor.execute("SELECT original_tag FROM {0}.tags".format(
            self.efiction_db))
        tags = list(cursor.fetchall())
        unique_tags = set(tag_dict['original_tag'] for tag_dict in tags)

        self.assertEqual(77L, len(tags),
                         "tags table should be a denormalised table")
        self.assertIn(
            u'Václav', unique_tags,
            "tags table should contain the tags referenced in the story files as a denormalised table")

    def test_copy_to_temp_db(self):
        self.efiction.copy_to_temp_db(has_coauthors=True)
        cursor = self.sql.cursor

        cursor.execute("SELECT * FROM {0}.fanfiction_stories".format(
            self.efiction_db))
        original_stories = cursor.fetchall()
        cursor.execute("SELECT * FROM {0}.stories".format(
            self.args.temp_db_database))
        stories = cursor.fetchall()

        cursor.execute("SELECT * FROM {0}.fanfiction_chapters".format(
            self.efiction_db))
        original_chapters = cursor.fetchall()
        cursor.execute("SELECT * FROM {0}.chapters".format(
            self.args.temp_db_database))
        chapters = cursor.fetchall()

        cursor.execute("SELECT * FROM {0}.fanfiction_authors".format(
            self.efiction_db))
        original_authors = cursor.fetchall()
        cursor.execute("SELECT * FROM {0}.authors".format(
            self.args.temp_db_database))
        authors = cursor.fetchall()

        self.assertEqual(
            len(original_stories), len(stories),
            "temp db stories table should contain all the stories from the original efiction table")
        self.assertEqual(
            len(original_chapters), len(chapters),
            "temp db chapters table should contain all the chapters from the original efiction table")
        self.assertEqual(
            len(original_authors), len(authors),
            "temp db authors table should contain all the authors from the original efiction table")
def _create_mysql(args, FILES):
    db = MySQLdb.connect(args.db_host, args.db_user, args.db_password, "")
    cursor = db.cursor()
    DATABASE_NAME = args.db_database
    PREFIX = args.db_table_prefix

    # Use the database and empty all the tables
    cursor.execute(u"drop database if exists {0};".format(DATABASE_NAME))
    cursor.execute(u"create database {0};".format(DATABASE_NAME))
    cursor.execute(u"use {0}".format(DATABASE_NAME))
    sql = Sql(args)
    sql.run_script_from_file('miscellaneous/open-doors-table-schema.sql', args)
    db.commit()

    authors = [(FILES[i].get('Author', '').strip(),
                FILES[i].get('Email', '').lower().strip()) for i in FILES]
    auth = u"INSERT INTO {0}_authors (name, email) VALUES(%s, %s);".format(PREFIX)
    cursor.executemany(auth, set(authors))
    db.commit()

    auth = u"SELECT * FROM {0}_authors;".format(PREFIX)
    cursor.execute(auth)
    db_authors = cursor.fetchall()

    stories = [(
        FILES[i].get('Title', '').replace("'", "\\'"),
        FILES[i].get('Summary', '').replace("'", "\\'"),
        FILES[i].get('Category', '').replace("'", "\\'"),
        FILES[i].get('Characters', '').replace("'", "\\'"),
        datetime.datetime.strptime(
            FILES[i].get('PrintTime',
                         str(datetime.datetime.now().strftime('%m/%d/%y'))),
            '%m/%d/%y').strftime('%Y-%m-%d'),
        # Some AA archives have a filetype
        # FILES[i].get('FileType', 'bookmark'),
        FILES[i].get('Location', '').replace("'", "\\'"),
        FILES[i].get('StoryURL', '').replace("'", "\\'"),
        FILES[i].get('Notes', '').replace("'", "\\'"),
        FILES[i].get('Pairing', '').replace("'", "\\'"),  # might be Pairings in some cases
        FILES[i].get('Rating', ''),
        FILES[i].get('Warnings', '').replace("'", "\\'"),
        FILES[i].get('Author', '').strip(),
        FILES[i].get('Email', '').lower().strip(),
    ) for i in FILES]

    for (title, summary, category, characters, date, location, url, notes,
         pairings, rating, warnings, author, email) in set(stories):
        try:
            table_name = '{0}_stories'.format(PREFIX)
            filename = location + '.html'  # not true for all AA archives!

            # For AA archives with external links:
            # if (filetype != 'bookmark'):
            #     filename = location + '.' + filetype
            #     table_name = '{0}_stories'.format(PREFIX)
            # else:
            #     filename = url
            #     table_name = '{0}_bookmarks'.format(PREFIX)

            result = [
                element for element in db_authors
                if element[1] == author and element[2] == email
            ]
            authorid = result[0][0]
            stor = u"""INSERT INTO {0} (fandoms, title, summary, tags, characters, date, url, notes,
                   relationships, rating, warnings, authorid)
                 VALUES('due South', '{1}', '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}');\n""" \
                .format(unicode(table_name), unicode(title, 'utf-8'),
                        unicode(summary, 'utf-8'), category, characters, date,
                        filename, unicode(notes, 'utf-8'), pairings, rating,
                        warnings, authorid)
            cursor.execute(stor)
        except:
            print(title, summary, category, characters, date, location, url)
            raise
    db.commit()
from shared_python import Args
from shared_python.Sql import Sql

from automated_archive import aa

if __name__ == "__main__":
    args = Args.args_for_01()
    sql = Sql(args)

    # eg: python 01-Load-into-Mysql.py -dh localhost -du root -dt dsa -dd temp_python -a AA -f /path/to/ARCHIVE_DB.pl -o .
    if args.archive_type == 'AA':
        print('--- Loading Automated Archive file "{0}" into database "{1}"'.format(
            args.db_input_file, args.db_database))
        aa.clean_and_load_data(args)

    # eg: python 01-Load-into-Mysql.py -dh localhost -du root -dt sd -dd temp_python -a EF -f /path/to/backup-from-efiction.sql -o .
    elif args.archive_type == 'EF':
        print('Loading eFiction file "{0}" into database "{1}"'.format(
            args.db_input_file, args.db_database))
        sql.run_script_from_file(args.db_input_file,
                                 database=args.db_database,
                                 prefix=args.db_table_prefix)
def _clean_email(author):
    email = author['email']
    if email is None or email == '':
        email = u'{0}{1}[email protected]'.format(author['name'], args.archive_name) \
            .replace(' ', '').replace("'", "")
    if email.startswith('mailto:'):
        email = email.replace('mailto:', '')
    return email


if __name__ == "__main__":
    args_obj = Args()
    args = args_obj.args_for_05()
    log = args_obj.logger_with_filename()
    sql = Sql(args, log)
    tags = Tags(args, sql.db, log)
    final = FinalTables(args, sql.db, log)
    chaps = Chapters(args, sql.db, log)
    coauthors = {}

    log.info("Creating destination tables in {0}".format(args.output_database))
    table_names = {
        'authors': 'authors',
        'stories': 'stories',
        'chapters': 'chapters',
        'story_links': 'story_links'
    }

    filter = 'WHERE id NOT IN '
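# Illustrative only, not part of the original script (hypothetical inputs): with
# args.archive_name == "DSA", _clean_email({'name': "Jo Smith", 'email': None})
# builds a placeholder address from the author name and archive name with spaces
# and apostrophes stripped, while
# _clean_email({'name': "Jo", 'email': 'mailto:jo@example.com'}) returns
# 'jo@example.com'.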
from shared_python import Args
from shared_python.Sql import Sql

from automated_archive import aa

if __name__ == "__main__":
    args = Args.args_for_01()
    sql = Sql(args)

    # eg: python 01-Load-into-Mysql.py -dh localhost -du root -dt dsa -dd temp_python -a AA -f /path/to/ARCHIVE_DB.pl -o .
    if args.archive_type == 'AA':
        print('\nLoading Automated Archive file "{0}" into database "{1}"'.format(
            args.db_input_file, args.temp_db_database))
        aa.clean_and_load_data(args)

    # eg: python 01-Load-into-Mysql.py -dh localhost -du root -dt sd -dd temp_python -a EF -f /path/to/backup-from-efiction.sql -o .
    elif args.archive_type == 'EF':
        print('\nLoading eFiction file "{0}" into database "{1}"'.format(
            args.db_input_file, args.temp_db_database))
        sql.run_script_from_file(args.db_input_file,
                                 database=args.temp_db_database,
                                 prefix=args.db_table_prefix,
                                 initial_load=True)
    print('\n')