def hydrate_tags_table(self, col, lookup_data, lookup_ids=False): self.cursor.execute( 'SELECT DISTINCT original_tag from tags WHERE original_column="{0}"' .format(col)) results = self.cursor.fetchall() total = len(results) cur = 0 print "{0} tags for column '{1}'".format(total, col) for tag_row in results: cur = Common.print_progress(cur, total) # Get tag data parent = '' extra_column = ', {0} as description'.format( lookup_data['extra_column'] ) if 'extra_column' in lookup_data else '' lookup_column = ', {0} as parent'.format( lookup_data['lookup_field'] ) if 'lookup_field' in lookup_data else '' matching_field = lookup_data[ 'id_name'] if lookup_ids else lookup_data['field_name'] dict_cursor = self.db.cursor(MySQLdb.cursors.DictCursor) dict_cursor.execute(""" SELECT {0}, {1}{2}{3} FROM {4} WHERE {5}='{6}' """.format(lookup_data['id_name'], lookup_data['field_name'], lookup_column, extra_column, lookup_data['table_name'], matching_field, tag_row[0])) tag = dict_cursor.fetchone() if tag is not None: # Get parent data if 'lookup_field' in lookup_data: self.cursor.execute( "SELECT {0} FROM {1} WHERE {2}='{3}'".format( lookup_data['lookup_table_field'], lookup_data['lookup_table'], lookup_data['lookup_id'], tag['parent'])) result = self.cursor.fetchone() parent = result[0] if result is not None else '' # Update the table description = tag['description'] if 'description' in tag else '' self.hydrate_tag_row(tag_id=tag[lookup_data['id_name']], tag_to_look_up=tag_row[0], tag=tag[lookup_data['field_name']], table=lookup_data['table_name'], col=col, parent=parent, description=description)
def _create_mysql(args, FILES, log):
    """(Re)create the temporary working database and populate the
    authors, stories and bookmarks tables from the archive metadata.

    :param args: parsed command-line args (db credentials, temp db name,
        default fandom, chapter file extensions).
    :param FILES: dict keyed by original story id -> metadata dict.
    :param log: logger used to report rows that fail to insert.
    """
    db = connect(args.db_host, args.db_user, args.db_password, "")
    cursor = db.cursor()
    DATABASE_NAME = args.temp_db_database

    # Use the database and empty all the tables
    cursor.execute(u"drop database if exists {0};".format(DATABASE_NAME))
    cursor.execute(u"create database {0};".format(DATABASE_NAME))
    cursor.execute(u"use {0}".format(DATABASE_NAME))
    sql = Sql(args)
    sql.run_script_from_file('shared_python/create-open-doors-tables.sql',
                             DATABASE_NAME)
    db.commit()

    # De-duplicate authors on (name, email) via set() before inserting.
    authors = [(FILES[i].get('Author', '').strip(),
                FILES[i].get('Email',
                             FILES[i].get('EmailAuthor',
                                          '')).lower().strip())
               for i in FILES]
    auth = u"INSERT INTO authors (name, email) VALUES(%s, %s);"
    cursor.executemany(auth, set(authors))
    db.commit()

    # Authors
    auth = u"SELECT * FROM authors;"
    cursor.execute(auth)
    db_authors = cursor.fetchall()

    # Stories and bookmarks
    stories = [(
        i,
        FILES[i].get('Title', '').replace("'", "\\'"),
        FILES[i].get('Summary', '').replace("'", "\\'"),
        _extract_tags(args, FILES[i]),
        _extract_characters(args, FILES[i]),
        # Date falls back through PrintTime -> DatePrint -> Date ->
        # today, then is normalised to ISO yyyy-mm-dd.
        datetime.datetime.strptime(
            FILES[i].get(
                'PrintTime',
                FILES[i].get(
                    'DatePrint',
                    FILES[i].get(
                        'Date',
                        str(datetime.datetime.now().strftime(
                            '%m/%d/%y'))))),
            '%m/%d/%y').strftime('%Y-%m-%d'),
        FILES[i].get('Location', '').replace("'", "\\'"),
        FILES[i].get('LocationURL',
                     FILES[i].get('StoryURL', '')).replace("'", "\\'"),
        FILES[i].get('Notes', '').replace("'", "\\'"),
        _extract_relationships(args, FILES[i]),
        FILES[i].get('Rating', ''),
        FILES[i].get('Warnings', '').replace("'", "\\'"),
        FILES[i].get('Author', '').strip(),
        FILES[i].get('Email',
                     FILES[i].get('EmailAuthor', '')).lower().strip(),
        FILES[i].get('FileType', args.chapters_file_extensions)
        if not _is_external(FILES[i]) else 'bookmark',
        _extract_fandoms(args, FILES[i]),
    ) for i in FILES]

    cur = 0
    total = len(FILES)
    for (original_id, title, summary, tags, characters, date, location,
         url, notes, pairings, rating, warnings, author, email, filetype,
         fandoms) in set(stories):
        cur = Common.print_progress(cur, total)
        try:
            # For AA archives with external links:
            if filetype != 'bookmark':
                if location == '':
                    filename = url
                else:
                    filename = location + '.' + filetype
                table_name = 'stories'
            else:
                filename = url
                table_name = 'bookmarks'

            # Clean up fandoms and add default fandom if it exists
            final_fandoms = fandoms.replace("'", r"\'")
            if args.default_fandom is not None:
                if final_fandoms == '' or final_fandoms == args.default_fandom:
                    final_fandoms = args.default_fandom
                else:
                    final_fandoms = args.default_fandom + ', ' + final_fandoms

            # Match this row's author by (name, email) in the freshly
            # inserted authors table.
            result = [
                element for element in db_authors
                if element[1] == author and element[2] == email
            ]
            authorid = result[0][0]
            # NOTE(review): values are interpolated into the SQL string,
            # relying on the hand-rolled quote escaping done when the
            # tuples were built. A parameterized INSERT would be safer
            # but would change the stored escape handling — left as-is.
            stor = u"""
                INSERT INTO {0} (id, fandoms, title, summary, tags, characters, date, url, notes, relationships, rating, warnings, author_id)
                VALUES({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12}', '{13}');\n""" \
                .format(table_name, original_id,
                        final_fandoms.replace(r"\\", "\\"),
                        title.replace(r"\\", "\\"), summary, tags,
                        characters, date, filename, notes, pairings,
                        rating, warnings, authorid)
            cursor.execute(stor)
        except Exception:
            # FIX: narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit are no longer caught; failure is logged with
            # full row context and then re-raised.
            log.error("table name: {0}\noriginal id: {1}\nfinal fandoms: '{2}'\ntitle: '{3}'\nsummary: '{4}'\ntags: '{5}'" \
                      "\ncharacters: '{6}'\ndate: '{7}'\nfilename: '{8}'\nnotes: '{9}'\npairings: '{10}'\nrating: '{11}'" \
                      "\nwarnings: '{12}'\nauthor id: '{13}'"\
                      .format(table_name, original_id, final_fandoms,
                              title, summary, tags, characters, date,
                              filename, notes, pairings, rating,
                              warnings, authorid))
            raise
    db.commit()
def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False):
    """Walk `chapters_path` collecting chapter files, detect duplicate
    chapter ids, and interactively resolve which copy to keep.

    :param chapters_path: root folder containing chapter files.
    :param extensions: comma-separated list of chapter file extensions.
    :param has_ids: True when file name stems are chapter ids.
    :return: dict mapping chapter id (or file stem) to file path.
    """
    self.log.info("\nFinding chapters and identifying duplicates")
    extensions = re.split(r", ?", extensions)
    story_folder = os.walk(chapters_path)
    file_paths = {}
    duplicate_chapters = {}
    has_duplicates = False
    messages = []
    sql_messages = []
    cur = 0
    for root, _, filenames in story_folder:
        total = len(filenames)
        Common.print_progress(cur, total)
        for filename in filenames:
            if has_ids and self._ends_with(filename, extensions):
                file_path = os.path.join(root, filename)
                cid = os.path.splitext(filename)[0]
                if cid not in file_paths.keys():
                    file_paths[cid] = file_path
                else:
                    # Same chapter id seen twice: record both candidate
                    # files so the user can pick one below.
                    duplicate_folder = os.path.split(
                        os.path.split(file_path)[0])[1]
                    messages.append(file_path + " is a duplicate of " +
                                    file_paths[cid])
                    # BUG FIX: was "{1}".format(cid), which raises
                    # IndexError (only one positional argument given).
                    sql_messages.append(
                        "SELECT * FROM chapters WHERE id = {0}".format(
                            cid))
                    duplicate_chapters[cid] = [{
                        'folder_name':
                        os.path.split(os.path.split(
                            file_paths[cid])[0])[1],
                        'filename': filename,
                        'path': file_paths[cid]
                    }, {
                        'folder_name': duplicate_folder,
                        'filename': filename,
                        'path': file_path
                    }]
                    has_duplicates = True
            else:
                file_path = os.path.join(root, filename)
                name = os.path.splitext(filename)[0]
                file_paths[name] = file_path
    if has_duplicates:
        self.log.warn('\n'.join(messages + sql_messages))
        self.log.warn(duplicate_chapters)
        folder_name_type = raw_input(
            "Resolving duplicates: pick the type of the folder name under {0} "
            "\n1 = author id\n2 = author name\n3 = skip duplicates check\n"
            .format(chapters_path))
        if folder_name_type == '1':
            for cid, duplicate in duplicate_chapters.items():
                # look up the author id and add that one to the file_names list
                # BUG FIX: was "... id = {1}".format(cid) -> IndexError;
                # the id is now also passed as a query parameter.
                self.cursor.execute(
                    "SELECT author_id FROM chapters WHERE id = %s",
                    (cid,))
                sql_author_id = self.cursor.fetchall()
                if len(sql_author_id) > 0:
                    author_id = sql_author_id[0][0]
                    # Keep the duplicate whose folder name matches the
                    # author id.
                    file_paths[cid] = [
                        dc['path'] for dc in duplicate_chapters[cid]
                        if dc['folder_name'] == str(author_id)
                    ][0]
        elif folder_name_type == '2':
            self.log.warn("Not implemented")
    return file_paths
def populate_chapters(self, folder=None, extensions=None):
    """Read chapter files from disk and store their text in the
    chapters table, matching on chapter id or on relative url.

    :param folder: chapters folder; defaults to args.chapters_path.
    :param extensions: comma-separated extension list; defaults to
        args.chapters_file_extensions.
    """
    if folder is None:
        folder = self.args.chapters_path
    if extensions is None:
        extensions = self.args.chapters_file_extensions
    self.log.info("Processing chapters...")
    filenames_are_ids = raw_input(
        "\nChapter file names are chapter ids? Y/N\n")
    # Simplified from `True if ... == 'y' else False`.
    has_ids = str.lower(filenames_are_ids) == 'y'
    file_paths = self._gather_and_dedupe(folder, extensions, has_ids)
    char_encoding = raw_input(
        "\n\nImporting chapters: pick character encoding (check for curly quotes):\n"
        "1 = Windows 1252\nenter = UTF-8\n")
    if char_encoding == '1':
        char_encoding = 'cp1252'
    else:
        char_encoding = 'utf8'
    cur = 0
    total = len(file_paths)
    if has_ids:
        # File stems are chapter ids: update rows by primary key.
        for cid, chapter_path in file_paths.items():
            with codecs.open(chapter_path, 'r',
                             encoding=char_encoding) as c:
                try:
                    cur = Common.print_progress(cur, total)
                    file_contents = c.read()
                    query = "UPDATE {0}.chapters SET text=%s WHERE id=%s".format(
                        self.args.output_database)
                    self.cursor.execute(query, (file_contents, int(cid)))
                    self.db.commit()
                except Exception as e:
                    # Log and continue with the next chapter file.
                    # (Dead `finally: pass` removed.)
                    self.log.error(
                        "Error = chapter id: {0} - chapter: {1}\n{2}".
                        format(cid, chapter_path, str(e)))
    else:
        # Otherwise match chapters on their path relative to the
        # chapters folder, filling only rows whose text is empty.
        for _, chapter_path in file_paths.items():
            path = chapter_path.replace(self.args.chapters_path, '')[1:]
            with codecs.open(chapter_path, 'r',
                             encoding=char_encoding) as c:
                try:
                    cur = Common.print_progress(cur, total)
                    file_contents = c.read()
                    query = "UPDATE {0}.chapters SET text=%s WHERE url=%s and text=''".format(
                        self.args.output_database)
                    self.cursor.execute(query, (file_contents, path))
                    self.db.commit()
                except Exception as e:
                    self.log.error(
                        "Error = chapter id: {0} - chapter: {1}\n{2}".
                        format(path, chapter_path, str(e)))
    self.db.close()
def _gather_and_dedupe(self, chapters_path, extensions):
    # Walk chapters_path collecting chapter files whose extension
    # matches, flag duplicate chapter ids, and interactively resolve
    # which copy to keep. Returns {chapter id: file path}.
    #
    # NOTE(review): this looks like an older, table-prefix variant of
    # the _gather_and_dedupe defined earlier in this file; if both live
    # in the same class, this later definition shadows the earlier one
    # — confirm which is intended.
    extensions = re.split(r", ?", extensions)
    story_folder = os.walk(chapters_path)
    file_paths = {}
    duplicate_chapters = {}
    error = False
    messages = []
    sql_messages = []
    cur = 0
    for root, _, filenames in story_folder:
        total = len(filenames)
        Common.print_progress(cur, total)
        for filename in filenames:
            if self._ends_with(filename, extensions):
                file_path = os.path.join(root, filename)
                cid = os.path.splitext(filename)[0]
                if cid not in file_paths.keys():
                    file_paths[cid] = file_path
                else:
                    # Duplicate chapter id: remember both the existing
                    # and the newly-found file so the user can pick one.
                    duplicate_folder = os.path.split(
                        os.path.split(file_path)[0])[1]
                    messages.append(file_path + " is a duplicate of " +
                                    file_paths[cid])
                    sql_messages.append(
                        "SELECT * FROM {0}_chapters WHERE id = {1}".format(
                            self.args.db_table_prefix, cid))
                    duplicate_chapters[cid] = [{
                        'folder_name':
                        os.path.split(os.path.split(
                            file_paths[cid])[0])[1],
                        'filename': filename,
                        'path': file_paths[cid]
                    }, {
                        'folder_name': duplicate_folder,
                        'filename': filename,
                        'path': file_path
                    }]
                    error = True
    if error:
        # Show all duplicates (plus copy-pasteable SQL lookups), then
        # ask the user how folder names should be interpreted.
        print '\n'.join(messages + sql_messages)
        print duplicate_chapters
        folder_name_type = raw_input(
            "Resolving duplicates: pick the type of the folder name under {0} \n1 = author id\n2 = author name\n"
            .format(chapters_path))
        if folder_name_type == '1':
            for cid, duplicate in duplicate_chapters.items():
                # look up the author id and add that one to the file_names list
                self.cursor.execute(
                    "SELECT authorid FROM {0}_chapters WHERE id = {1}".
                    format(self.args.db_table_prefix, cid))
                sql_author_id = self.cursor.fetchall()
                if len(sql_author_id) > 0:
                    author_id = sql_author_id[0][0]
                    # Keep the copy whose folder name matches the
                    # author id.
                    file_paths[cid] = [
                        dc['path'] for dc in duplicate_chapters[cid]
                        if dc['folder_name'] == str(author_id)
                    ][0]
    return file_paths