def charge(self, input_voltage=0.0):
    if self.check_voltage(input_voltage):
        for percentage in range(1, 11):
            self.print_charge_percentage(percentage)
        # when charge is complete
        clean_line()
        print('Charge is complete')

def print_charge_percentage(self, percentage):
    _pretty_battery = list('[' + ' ' * 9 + ']')  # one slot per 10% step
    for i in range(1, percentage):
        _pretty_battery[i] = '='
    sleep(0.2)
    if percentage > 1:
        # Wipe the previous progress line before redrawing the bar.
        clean_line()
    print('Charging : ' + "".join(_pretty_battery) + '>')

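Here clean_line() is a terminal helper that erases the previously printed progress line; it is unrelated to the utils.clean_line text cleaner used by the other snippets in this collection. A minimal sketch, assuming an ANSI-capable terminal (the escape sequence and the function body are illustrative, not the original implementation):

import sys

def clean_line():
    # Hypothetical sketch: return the cursor to the start of the line and
    # erase it, so the next print overwrites the old progress bar.
    sys.stdout.write('\r\x1b[2K')
    sys.stdout.flush()
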
def find_steam_info(game_dir):
    """Determine the steam ID and game name of this folder, if it has one.

    This only works on Source games!
    """
    game_id = -1
    name = "ERR"
    found_name = False
    found_id = False
    for folder in os.listdir(game_dir):
        info_path = os.path.join(game_dir, folder, 'gameinfo.txt')
        if os.path.isfile(info_path):
            with open(info_path) as file:
                for line in file:
                    clean_line = utils.clean_line(line).replace('\t', ' ')
                    if not found_id and 'steamappid' in clean_line.casefold():
                        raw_id = clean_line.casefold().replace(
                            'steamappid', '').strip()
                        try:
                            game_id = int(raw_id)
                        except ValueError:
                            pass
                        else:
                            # Mark the ID as found so we can stop early.
                            found_id = True
                    elif not found_name and 'game ' in clean_line.casefold():
                        found_name = True
                        ind = clean_line.casefold().rfind('game') + 4
                        name = clean_line[ind:].strip().strip('"')
                    if found_name and found_id:
                        break
        if found_name and found_id:
            break
    return game_id, name

def parse(posfile, propfile, path):
    "Parse through the given palette file to get all data."
    props = Property.parse(propfile, path + ':properties.txt')
    name = "Unnamed"
    opts = {}
    for option in props:
        if option.name == "name":
            name = option.value
        else:
            opts[option.name.casefold()] = option.value
    pos = []
    for dirty_line in posfile:
        line = utils.clean_line(dirty_line)
        if line:
            # Lines follow the form
            # "ITEM_BUTTON_FLOOR", 2
            # for subtype 3 of the button
            if line.startswith('"'):
                val = line.split('",')
                if len(val) == 2:
                    pos.append((
                        val[0][1:],  # Item ID
                        int(val[1].strip()),  # Item subtype
                    ))
                else:
                    print("Malformed row '" + line + "'!")
                    return None
    return Palette(name, pos, opts, filename=path)

def segment_target_translation(matrix, target):
    if not target:
        matrix = [list(i) for i in zip(*matrix)]
    finalString = ""
    translation = []
    lastCol = -1
    last_alignment = ""
    for i in range(1, len(matrix)):  # for each element
        col = get_max_prob_col(i, matrix)
        aligned_word = matrix[0][col]
        if lastCol == -1:  # first character
            finalString += matrix[i][0]  # put the character in the beginning
            last_alignment = aligned_word
        elif lastCol == col:  # the current character and the last one are not separated
            finalString += matrix[i][0]
        else:
            finalString += " " + matrix[i][0]
            translation.append(last_alignment)
            last_alignment = aligned_word
        lastCol = col
    translation.append(last_alignment)
    finalString = finalString.replace("  ", " ")
    if finalString[-1] == " ":
        finalString = finalString[:-1]
    if finalString[0] == " ":
        finalString = finalString[1:]
    discovered_words = finalString.split(" ")
    assert len(discovered_words) == len(translation)
    if discovered_words[-1] == utils.EOS_symbol:
        # if we segmented the EOS symbol, we need to remove it and its aligned translation
        discovered_words = discovered_words[:-1]
        translation = translation[:-1]
    finalString = utils.clean_line(" ".join(discovered_words))
    return finalString, " ".join(translation)

def parse(posfile, propfile, path):
    "Parse through the given palette file to get all data."
    props = Property.parse(propfile, path + ':properties.txt')
    name = "Unnamed"
    opts = {}
    for option in props:
        if option.name == "name":
            name = option.value
        else:
            opts[option.name.casefold()] = option.value
    pos = []
    for dirty_line in posfile:
        line = utils.clean_line(dirty_line)
        if line:
            # Lines follow the form
            # "ITEM_BUTTON_FLOOR", 2
            # for subtype 3 of the button
            if line.startswith('"'):
                val = line.split('",')
                if len(val) == 2:
                    pos.append((
                        val[0][1:],  # Item ID
                        int(val[1].strip()),  # Item subtype
                    ))
                else:
                    LOGGER.warning('Malformed row "{}"!', line)
                    return None
    return Palette(name, pos, opts, filename=path)

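For reference, the position file both parse variants read is a plain list of quoted item IDs followed by an integer subtype, in the format the inline comment describes. A made-up excerpt (the item IDs are illustrative only):

"ITEM_BUTTON_FLOOR", 2
"ITEM_CUBE", 0
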
def parse(cls, data):
    conf = data.info.find_key('Config', '')
    mats = [prop.value for prop in data.info.find_all('AddIfMat')]
    if conf.has_children():
        # Allow having a child block to define packlists inline
        files = [prop.value for prop in conf]
    else:
        path = 'pack/' + conf.value + '.cfg'
        try:
            with data.zip_file.open(path) as f:
                # Each line is a file to pack.
                # Skip blank lines, strip whitespace, and
                # allow // comments.
                files = []
                for line in f:
                    line = utils.clean_line(line)
                    if line:
                        files.append(line)
        except KeyError as ex:
            raise FileNotFoundError('"{}:{}" not in zip!'.format(
                data.id,
                path,
            )) from ex
    return cls(
        data.id,
        files,
        mats,
    )

def scrape_dissertation_page(page, alumni_list, faculty_list):
    if page['text'] == 'ERROR':
        return {
            'alumni_match': None,
            'alumni_compared_line': None,
            'faculty_matches': None
        }

    page = [
        line for line in re.split(r'; |, |\*|\n', page['text'])
        if line not in ['', None]
    ]

    cleaned_lines = []
    for line in page:
        line = strip_page_stop_words(line)
        if len(line) > 0:
            line = reduce(lambda a, b: a + ' ' + b, line)
            cleaned_lines.append(line)
    page = cleaned_lines

    committee_index = next(
        (i for i, line in enumerate(page) if 'committee' in clean_line(line)),
        0)

    alumni_matches = get_top_matches(page, alumni_list,
                                     normalize_func=lambda i: i)
    faculty_matches = get_top_matches(page[committee_index + 1:], faculty_list,
                                      normalize_func=lambda i: i)

    alumni_matches = [tup for tup in alumni_matches if tup[2] >= 0.90]
    faculty_matches = [tup for tup in faculty_matches if tup[2] >= 0.90]

    if len(alumni_matches):
        compared_line = reduce(
            lambda a, b: str(a) + ' ' + str(b),
            [tup[0] for tup in alumni_matches if tup[2] >= 0.90])
        alumni_matches = [tup[1] for tup in alumni_matches]
    else:
        compared_line = None
        alumni_matches = None

    if len(faculty_matches):
        faculty_matches = [
            [tup[1], 'Chair'] if (i == 0) or ('chair' in tup[0].lower())
            else [tup[1], 'Non-chair']
            for i, tup in enumerate(faculty_matches)
        ]
    else:
        faculty_matches = None

    return {
        'alumni_match': alumni_matches,
        'alumni_compared_line': compared_line,
        'faculty_matches': faculty_matches
    }

def edit_gameinfo(self, add_line=False):
    """Modify all gameinfo.txt files to add or remove our line.

    Add_line determines if we are adding or removing it.
    """
    for folder in self.dlc_priority():
        info_path = os.path.join(self.root, folder, 'gameinfo.txt')
        if os.path.isfile(info_path):
            with open(info_path) as file:
                data = list(file)

            for line_num, line in reversed(list(enumerate(data))):
                clean_line = utils.clean_line(line)
                if add_line:
                    if clean_line == GAMEINFO_LINE:
                        break  # Already added!
                    elif '|gameinfo_path|' in clean_line:
                        LOGGER.debug(
                            "Adding gameinfo hook to {}",
                            info_path,
                        )
                        # Match the line's indentation
                        data.insert(
                            line_num + 1,
                            utils.get_indent(line) + GAMEINFO_LINE + '\n',
                        )
                        break
                else:
                    if clean_line == GAMEINFO_LINE:
                        LOGGER.debug(
                            "Removing gameinfo hook from {}",
                            info_path,
                        )
                        data.pop(line_num)
                        break
            else:
                if add_line:
                    LOGGER.warning(
                        'Failed editing "{}" to add our special folder!',
                        info_path,
                    )
                continue

            with open(info_path, 'w') as file:
                for line in data:
                    file.write(line)

    if not add_line:
        # Restore the original files!
        for name, file, ext in FILES_TO_BACKUP:
            item_path = self.abs_path(file + ext)
            backup_path = self.abs_path(file + '_original' + ext)
            old_version = self.abs_path(file + '_styles' + ext)
            if os.path.isfile(old_version):
                LOGGER.info('Restoring Stylechanger version of "{}"!', name)
                shutil.copy(old_version, item_path)
            elif os.path.isfile(backup_path):
                LOGGER.info('Restoring original "{}"!', name)
                shutil.move(backup_path, item_path)
    self.clear_cache()

def main():
    global NUM_TOP_TOPICS
    check_start('./test_doc_guess_algorithm.py <folder> ' +
                '<output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]
    num_topics = int(sys.argv[3])

    topic_order_count = 0
    topic_feeling_count = 0
    topic_offset_count = 0

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    info_file = folder + '/out/final.other'

    with open(format_file) as data_file:
        format_lines = data_file.readlines()
    with open(info_file) as data_file:
        info_lines = data_file.readlines()

    NUM_TOP_TOPICS = int(info_lines[0].split(' ')[1])
    known_topics = get_known_topics(folder)
    print(str(known_topics[0]))
    word_dict = get_word_dictionary(folder, vocab_file)
    hierarchy_struct = get_hierarchy_struct(folder)

    # Cycle through top-level documents
    count = 0
    for index in range(len(format_lines)):
        line = format_lines[index]
        line = line.strip()
        # known_topics = get_known_topics()
        weights = get_document_values(index, hierarchy_struct)
        contents = line.split('|~|')[3]
        contents = clean_line(contents)
        classed_doc = classify(contents, weights, word_dict)
        classed_array = get_classed_array(classed_doc)
        if index % 1000 == 0 and index != 0:
            print('\tTopics correct position ' +
                  '%d: %s' % (index, str(topic_order_count / index)))
        if compare_topics_order(classed_array, known_topics[index], 1):
            topic_order_count += 1
        count += 1

    print('\tTopics correct position %d: %s' %
          (count, str(topic_order_count / count)))

def parse(cls, data):
    conf = data.info.find_key('Config', '')
    mats = [
        prop.value
        for prop in data.info.find_all('AddIfMat')
    ]
    if conf.has_children():
        # Allow having a child block to define packlists inline
        files = [
            prop.value
            for prop in conf
        ]
    else:
        path = 'pack/' + conf.value + '.cfg'
        try:
            with data.zip_file.open(path) as f:
                # Each line is a file to pack.
                # Skip blank lines, strip whitespace, and
                # allow // comments.
                files = []
                for line in f:
                    line = utils.clean_line(line)
                    if line:
                        files.append(line)
        except KeyError as ex:
            raise FileNotFoundError(
                '"{}:{}" not in zip!'.format(
                    data.id,
                    path,
                )
            ) from ex

    if CHECK_PACKFILE_CORRECTNESS:
        # Use normpath so sep differences are ignored, plus case.
        zip_files = {
            os.path.normpath(file).casefold()
            for file in zip_names(data.zip_file)
            if file.startswith('resources')
        }
        for file in files:
            # Check to make sure the files exist...
            file = os.path.join('resources', os.path.normpath(file)).casefold()
            if file not in zip_files:
                LOGGER.warning(
                    'Warning: "{file}" not in zip! ({pak_id})',
                    file=file,
                    pak_id=data.pak_id,
                )

    return cls(
        data.id,
        files,
        mats,
    )

def edit_gameinfo(self, add_line=False):
    """Modify all gameinfo.txt files to add or remove our line.

    Add_line determines if we are adding or removing it.
    """
    for folder in self.dlc_priority():
        info_path = os.path.join(self.root, folder, 'gameinfo.txt')
        if os.path.isfile(info_path):
            with open(info_path) as file:
                data = list(file)

            for line_num, line in reversed(list(enumerate(data))):
                clean_line = utils.clean_line(line)
                if add_line:
                    if clean_line == GAMEINFO_LINE:
                        break  # Already added!
                    elif '|gameinfo_path|' in clean_line:
                        print("Adding gameinfo hook to " + info_path)
                        # Match the line's indentation
                        data.insert(
                            line_num + 1,
                            utils.get_indent(line) + GAMEINFO_LINE + '\n',
                        )
                        break
                else:
                    if clean_line == GAMEINFO_LINE:
                        print("Removing gameinfo hook from " + info_path)
                        data.pop(line_num)
                        break
            else:
                if add_line:
                    print('Failed editing "' + info_path +
                          '" to add our special folder!')
                continue

            with open(info_path, 'w') as file:
                for line in data:
                    file.write(line)

    if not add_line:
        # Restore the original files!
        for name, file, ext in FILES_TO_BACKUP:
            item_path = self.abs_path(file + ext)
            backup_path = self.abs_path(file + '_original' + ext)
            old_version = self.abs_path(file + '_styles' + ext)
            if os.path.isfile(old_version):
                print("Restoring Stylechanger version of " + name + "!")
                shutil.copy(old_version, item_path)
            elif os.path.isfile(backup_path):
                print("Restoring original " + name + "!")
                shutil.move(backup_path, item_path)
    self.clear_cache()

def generate_inst_dict(instance_file_path, lines_to_read, vertex_set=None, print_details=False):
    """
    :param instance_file_path: path to the instance file used to generate the dictionary;
    :param lines_to_read: number of lines to read in the file; if <= 0, read the entire file;
    :param print_details: boolean, if True print details about the state of the processing;
    :param vertex_set: if present, use a set of vertices as starting point,
        with all the vertices being "owl#Thing";
    :return: dictionary that contains for each entity its instance type;
    """
    # Dictionary where the entity types are stored;
    instance_dict = {}

    current_line = 0
    start_time = time.time()
    with utils.read_compressed(instance_file_path) as infile:
        for line in infile:
            # Skip the header line, and skip commented-out lines;
            if current_line != 0 and line[0] != "#":
                # Clean the line and split it in 4; keep only the first 3;
                triple = utils.clean_line(line, ignore_literals=True,
                                          obtain_resource_manually=True)
                entity_name, _, entity_type = triple

                # Add to the dict;
                # Note: the data are dirty, some entities appear more than once
                # with different types.
                # As this problem doesn't occur often, the last type that appears is kept.
                # If the entity is already present, with type "owl#Thing",
                # the type is overwritten with something more specific.
                if len(entity_name) > 0 and len(entity_type) > 0:
                    if vertex_set is None or entity_name in vertex_set:
                        if entity_name not in instance_dict or \
                                (entity_name in instance_dict and
                                 instance_dict[entity_name] == "owl#Thing"):
                            instance_dict[entity_name] = entity_type

            current_line += 1
            if not current_line % 100000 and print_details:
                print("LINES READ: {} -- ELAPSED TIME: {:.2f} seconds"
                      .format(current_line, time.time() - start_time))

            # Stop reading if enough lines have been read;
            if lines_to_read > 0 and current_line > lines_to_read:
                break

    # Add all the other vertices (only possible if a vertex set was given);
    if vertex_set is not None:
        for v in vertex_set:
            if v not in instance_dict:
                instance_dict[v] = "owl#Thing"

    return instance_dict

def process_docx(file_path: Path):
    """
    Extracts text from .docx files

    Args:
        file_path(Path) : Path object that contains the file_path of the .docx file

    Returns:
        list : The sentences extracted from the file
    """
    doc = Document(file_path)
    sentences = []
    for para in doc.paragraphs:
        for line in para.text.split("."):
            line = clean_line(line)
            if line:
                sentences.append(line)
    return sentences

def process_doc(file_path: Path):
    """
    Extracts text from .doc files

    Args:
        file_path(Path) : Path object that contains the file_path of the .doc file

    Returns:
        list : The sentences extracted from the file
    """
    try:
        p = sp.run(["catdoc", str(file_path)], capture_output=True)
        output = p.stdout.decode()
        sentences = [clean_line(line) for line in output.split("\n\n") if line]
        return sentences
    except FileNotFoundError as e:
        print("Unable to process", file_path)
        print(e.strerror)
        return []

def process_pdf(file_path: Path):
    """
    Extracts text from .pdf files

    Args:
        file_path(Path) : Path object that contains the file_path of the .pdf file

    Returns:
        list : The sentences extracted from the file
    """
    sentences = []
    with open(file_path, "rb") as f:
        resmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(resmgr, device)
        for page in PDFPage.get_pages(f, caching=True, check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox):
                    sentences.append(clean_line(lt_obj.get_text()))
    return sentences

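A possible way to tie the three extractors above together is a small dispatcher keyed on the file suffix. This is a sketch, not part of the original module; it assumes process_docx, process_doc, process_pdf, and clean_line are defined as above:

from pathlib import Path

def process_file(file_path: Path):
    # Hypothetical dispatcher over the extractors defined above.
    handlers = {
        ".docx": process_docx,
        ".doc": process_doc,
        ".pdf": process_pdf,
    }
    handler = handlers.get(file_path.suffix.lower())
    if handler is None:
        print("Unsupported file type:", file_path)
        return []
    return handler(file_path)
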
def segment_target(matrix, target):
    if not target:
        matrix = [list(i) for i in zip(*matrix)]
    finalString = ""
    lastCol = -1
    for i in range(1, len(matrix)):  # for each element
        col = get_max_prob_col(i, matrix)
        if lastCol == -1:  # first character
            finalString += matrix[i][0]  # put the character in the beginning
        elif lastCol == col:  # the current character and the last one are not separated
            finalString += matrix[i][0]
        else:
            finalString += " " + matrix[i][0]
        lastCol = col
    finalString = finalString.replace("  ", " ")
    if finalString[-1] == " ":
        finalString = finalString[:-1]
    if finalString[0] == " ":
        finalString = finalString[1:]
    return utils.clean_line(finalString)

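Both segmentation functions above rely on get_max_prob_col to pick, for each target character (row i), the source column it aligns to most strongly. A minimal sketch under the assumption that row 0 holds the source words, column 0 holds the target characters, and the remaining cells hold alignment probabilities; this layout and the helper body are assumptions, not the project's actual code:

def get_max_prob_col(i, matrix):
    # Assumed layout: matrix[i][0] is the target character,
    # matrix[i][1:] are alignment probabilities against matrix[0][1:].
    probs = matrix[i][1:]
    # Offset by 1 so the returned index addresses the full row, matching
    # the matrix[0][col] lookups in the callers.
    return 1 + max(range(len(probs)), key=lambda j: probs[j])
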
def parse(cls, data):
    conf = data.info.find_key('Config', '')
    mats = [
        prop.value
        for prop in data.info.find_all('AddIfMat')
    ]
    if conf.has_children():
        # Allow having a child block to define packlists inline
        files = [
            prop.value
            for prop in conf
        ]
    else:
        path = 'pack/' + conf.value + '.cfg'
        try:
            with data.zip_file.open(path) as f:
                # Each line is a file to pack.
                # Skip blank lines, strip whitespace, and
                # allow // comments.
                files = []
                for line in f:
                    line = utils.clean_line(line)
                    if line:
                        files.append(line)
        except KeyError as ex:
            raise FileNotFoundError(
                '"{}:{}" not in zip!'.format(
                    data.id,
                    path,
                )
            ) from ex
    return cls(
        data.id,
        files,
        mats,
    )

def main():
    check_start('./test_doc_guess_algorithm.py <folder> <output> <num_topics>', 4)

    # Input naming
    folder = sys.argv[1]
    output = sys.argv[2]
    num_topics = int(sys.argv[3])

    topic_order_count = [0] * num_topics
    topic_feeling_count = [0] * num_topics
    topic_offset_count = [0] * num_topics

    # File allocation
    format_file = folder + '/initial.formatted'
    vocab_file = folder + '/initial.vocab'
    word_assignment_file = folder + '/out/word-assignments.dat'
    gamma_file = folder + '/out/final.gamma'

    # Read in required data
    with open(vocab_file) as data_file:
        vocab_lines = data_file.readlines()
    with open(word_assignment_file) as data_file:
        word_assignment_lines = data_file.readlines()
    with open(format_file) as data_file:
        format_lines = data_file.readlines()
    with open(gamma_file) as data_file:
        gamma_lines = data_file.readlines()

    vocab_details = get_vocab_details(vocab_lines, word_assignment_lines)
    topic_lines = get_document_topics(gamma_lines)
    vocab_index = get_vocab_index(vocab_lines)
    num_docs = len(format_lines)

    for index in range(len(format_lines)):
        line = format_lines[index]
        line = line.strip()
        known_topics = topic_lines[index].strip().split(' ')
        document_contents = line.split('|~|')[3]
        document_contents = clean_line(document_contents)
        classed_doc = classify(document_contents, vocab_details, vocab_index)
        if len(classed_doc) == 0:
            print(document_contents)
            continue
        if index % 1000 == 0 and index != 0:
            print('Snapshot: ' + str(index))
            for i in range(1, num_topics):
                print('\tTopics correct position %d: %s' %
                      (i, str(topic_order_count[i] / index)))
                print('\tTopics within range %d: %s' %
                      (i, str(topic_feeling_count[i] / index)))
                print('\tTopics within offset %d: %s' %
                      (i, str(topic_offset_count[i] / index)))
        for i in range(1, len(classed_doc)):
            if compare_topics_order(classed_doc, known_topics, i):
                topic_order_count[i] += 1
            if compare_topic_slice(classed_doc, known_topics, i):
                topic_feeling_count[i] += 1
            if compare_position_offset(classed_doc, known_topics, i):
                topic_offset_count[i] += 1

    out_ptr = open(output, 'w')
    out_ptr.write('Number of documents: ' + str(num_docs) + '\n')
    for i in range(num_topics):
        out_ptr.write('Topics correct position %d: %s\n' %
                      (i, str(topic_order_count[i] / num_docs)))
        out_ptr.write('Topics within range %d: %s\n' %
                      (i, str(topic_feeling_count[i] / num_docs)))
        out_ptr.write('Topics within offset %d: %s\n' %
                      (i, str(topic_offset_count[i] / num_docs)))

def write_graph(input_path,
                output_path_v,
                output_path_e,
                instance_dict=None,
                lines_to_read=0,
                vertex_dict=None,
                print_details=True,
                instance_dict_vertices_only=True,
                ignore_literals=True,
                add_orphan_vertices=False,
                add_triples=False,
                track_vertex_pairs=False,
                track_all_edges=False,
                edge_set=None,
                skip_existing_pairs=False):
    """
    :param input_path: a text file containing a list of RDF triples;
    :param output_path_v: the file path where the vertices are written;
    :param output_path_e: the file path where the edges are written;
    :param graph_dict_path: path to where the vertex id dictionary is stored, as a pickled file;
    :param instance_dict: the dictionary that contains the instance type of each vertex;
    :param lines_to_read: the maximum number of triples to be read; if < 1 read the entire file;
    :param add_orphan_vertices: if True, add to the graph all the vertices in instance dict;
    :param print_details: boolean, if True print details about the state of the processing;
    :param add_triples: if True, return a dictionary that contains all the triples
        that have been added to the graph;
    :param track_vertex_pairs: if True, store the vertex pairs that are added as edges,
        in a directed way.
    :param track_all_edges: if True, store all the vertex pairs,
        otherwise store only redirects/disambiguations.
    :param edge_set: if present, add vertex pairs to this set;
    :param skip_existing_pairs: if True, don't add the edges that are present in the edge_set;
    :return: tuple that contains the number of lines that have been read,
        and the number of vertices in the graph;

    Write a list of RDF triples into a PGX-compatible graph, written in EDGELIST format.
    """
    # Read the input file line-by-line, and add the information to a dictionary
    # that represents the graph;
    # Write the graph as EDGELIST text files;

    # A dictionary in which each entity is given a unique id;
    # If specified, load an existing one;
    if vertex_dict is not None:
        write_mode = "a+"
    else:
        vertex_dict = {}
        write_mode = "w+"

    # Time the execution;
    start_time = time.time()

    triple_dict = {}
    if edge_set is None:
        edge_set = set()
    edge_filter = ["wikiPageRedirects", "wikiPageDisambiguates"]

    current_line = 0
    edge_count = 0
    skipped_self_loops = 0
    # ID of the next vertex to be added;
    # using len(graph) + 1 allows incremental additions;
    vertex_id = len(vertex_dict) + 1

    with open(output_path_v, write_mode, encoding="utf-8") as outfile_v:
        with open(output_path_e, write_mode, encoding="utf-8") as outfile_e:
            with utils.read_compressed(input_path) as infile:
                for line in infile:
                    # Skip the header line, and skip commented-out lines;
                    if current_line != 0 and line[0] != "#":
                        # Create a triple from the given line;
                        triple = utils.clean_line(
                            line, ignore_literals, obtain_resource_manually=True)
                        source, relation, destination = triple

                        if source == destination:
                            skipped_self_loops += 1

                        # It is possible to skip the current edge
                        # if its vertices are not in the list of vertices;
                        if not instance_dict_vertices_only or (instance_dict is None) \
                                or ((source in instance_dict) and (destination in instance_dict)):
                            # Add the triple to the graph;
                            # Also add a unique vertex_id to each vertex that is added;
                            # The third element is processed first,
                            # so that we have its unique ID if we have to add a new edge;
                            skip_dest = False

                            # Check if the current triple should be skipped;
                            if source == "" or relation == "":
                                skip_dest = True
                            if ignore_literals and destination == "":
                                skip_dest = True

                            # Add source and destination vertices;
                            # Add the source vertex;
                            if source not in vertex_dict:
                                # Keep track of the vertex with a unique ID
                                vertex_dict[source] = vertex_id
                                # Write the name of the entity and its type;
                                # each line has the form "entity_name, {instance_type}";
                                outfile_v.write('"{}" * "{}"\n'.format(
                                    source, get_instance_type(source, instance_dict)))
                                vertex_id += 1

                            # Add the destination vertex;
                            if (not skip_dest) and (destination not in vertex_dict):
                                # Keep track of the vertex with a unique ID
                                vertex_dict[destination] = vertex_id
                                # Write a new vertex like before;
                                outfile_v.write('"{}" * "{}"\n'.format(
                                    destination, get_instance_type(destination, instance_dict)))
                                vertex_id += 1

                            # Add a new edge;
                            # Skip self-loops;
                            if not skip_dest and (source != destination):
                                if add_triples:
                                    if source in triple_dict:
                                        triple_dict[source] += [triple]
                                    else:
                                        triple_dict[source] = [triple]
                                # Write a new edge;
                                if not (skip_existing_pairs and
                                        (vertex_dict[source], vertex_dict[destination]) in edge_set):
                                    outfile_e.write('"{}" "{}" "{}"\n'.format(
                                        source, destination, relation))
                                    edge_count += 1
                                # Keep track of the pairs (source, destination),
                                # after writing the current edge
                                # (otherwise no edge is added!);
                                if track_vertex_pairs:
                                    if not track_all_edges and relation in edge_filter:
                                        edge_set.add(
                                            (vertex_dict[source], vertex_dict[destination]))
                                    else:
                                        edge_set.add(
                                            (vertex_dict[source], vertex_dict[destination]))

                    current_line += 1
                    if not current_line % 100000 and print_details:
                        print("\tLINES READ: {} -- TIME: {:.2f} seconds"
                              " -- TOT. VERTICES: {} -- EDGES ADDED: {}".format(
                                  current_line, time.time() - start_time,
                                  vertex_id, edge_count))

                    # Stop reading if enough lines have been read;
                    if lines_to_read > 0 and current_line > lines_to_read:
                        break

            # Add all the remaining vertices;
            if add_orphan_vertices and instance_dict is not None:
                additional_vertices = 0
                for v in instance_dict:
                    if v not in vertex_dict:
                        vertex_dict[v] = vertex_id
                        outfile_v.write('"{}" * "{}"\n'.format(
                            v, get_instance_type(v, instance_dict)))
                        vertex_id += 1
                        additional_vertices += 1
                if print_details:
                    print("ADDITIONAL VERTICES FROM INSTANCE DICT: {}".format(
                        additional_vertices))
                    print("SKIPPED SELF LOOPS: {}".format(skipped_self_loops))

    return (current_line - 1, triple_dict, edge_set, vertex_dict, edge_count)

def edit_gameinfo(self, add_line=False):
    """Modify all gameinfo.txt files to add or remove our line.

    Add_line determines if we are adding or removing it.
    """
    if self.is_modded() == add_line:
        # It's already in the correct state!
        return

    for folder in self.dlc_priority():
        info_path = os.path.join(self.root, folder, 'gameinfo.txt')
        if os.path.isfile(info_path):
            with open(info_path) as file:
                data = list(file)

            for line_num, line in reversed(list(enumerate(data))):
                clean_line = utils.clean_line(line)
                if add_line:
                    if clean_line == GAMEINFO_LINE:
                        break  # Already added!
                    elif '|gameinfo_path|' in clean_line:
                        print("Adding gameinfo hook to " + info_path)
                        # Match the line's indentation
                        data.insert(
                            line_num + 1,
                            utils.get_indent(line) + GAMEINFO_LINE + '\n',
                        )
                        break
                else:
                    if clean_line == GAMEINFO_LINE:
                        print("Removing gameinfo hook from " + info_path)
                        data.pop(line_num)
                        break
            else:
                if add_line:
                    print('Failed editing "' + info_path +
                          '" to add our special folder!')
                continue

            with open(info_path, 'w') as file:
                for line in data:
                    file.write(line)

    if add_line:
        with open(self.abs_path('BEE2_EDIT_FLAG'), 'w') as file:
            file.write('')
    else:
        os.remove(self.abs_path('BEE2_EDIT_FLAG'))
        # Restore the original files!
        for name, file, ext in FILES_TO_BACKUP:
            item_path = self.abs_path(file + ext)
            backup_path = self.abs_path(file + '_original' + ext)
            old_version = self.abs_path(file + '_styles' + ext)
            if os.path.isfile(old_version):
                print("Restoring Stylechanger version of " + name + "!")
                shutil.copy(old_version, item_path)
            elif os.path.isfile(backup_path):
                print("Restoring original " + name + "!")
                shutil.move(backup_path, item_path)
    self.clear_cache()

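The hook insertion in the edit_gameinfo variants keys off a SearchPaths entry containing |gameinfo_path|, which in a stock Source-engine gameinfo.txt looks roughly like the excerpt below (illustrative only; the real file nests this inside a FileSystem block, and GAMEINFO_LINE is whatever the module-level constant holds). The new line is inserted directly after the |gameinfo_path| entry, copying its indentation:

	SearchPaths
	{
		Game	|gameinfo_path|.
		Game	portal2
	}
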
def gen_train_samples(self):
    self.original2corp()
    sample_set = {}
    np.random.seed(8)

    # Load the data, one text at a time
    important_tokens = []
    text = open(FLAGS.corpus_file, encoding="utf8").readlines()[:10]
    print("select important tokens...")
    for e in tqdm(text, total=len(text)):
        tmp = self.tokenizer.select_important_tokens(clean_line(e.strip()))
        if len(tmp) < 10:
            continue
        important_tokens.append(tmp)

    # Sample positive and negative examples: words from the same text are
    # positives, words from other texts are negatives
    print("sample(1+k negative) train and valid set...")
    num_neg = min(len(important_tokens) - 1, MAX_NUM_NEG)
    for cur_index, cur_ele in tqdm(enumerate(important_tokens),
                                   total=len(important_tokens)):
        np.random.shuffle(cur_ele)
        cut_index = int(len(cur_ele) / 3)
        lhs, rhs = cur_ele[:cut_index], cur_ele[cut_index:]
        for word_index, word in enumerate(lhs):
            if word in sample_set:
                continue
            positive_entity = rhs[word_index]  # positive sample
            # Negative sampling
            negative_entitys, negs = [], []
            negative_indexes = [
                i for i in range(len(important_tokens)) if i != cur_index
            ]
            random.shuffle(negative_indexes)
            for e in negative_indexes:
                if len(negs) >= num_neg:
                    break
                if word in important_tokens[e] or positive_entity in important_tokens[e]:
                    continue
                negs.append(e)
            for neg_index in negs:
                while True:
                    neg_tmp = random.sample(important_tokens[neg_index], 1)[0]
                    if neg_tmp != word and neg_tmp not in negative_entitys:
                        break
                negative_entitys.append(neg_tmp)
            assert len(negative_entitys) == num_neg
            # Pad when too few negatives were sampled
            #if len(negative_entitys) < num_neg:
            #    negative_entitys += ["PAD"] * (num_neg - len(negative_entitys))
            sample_set[word] = [positive_entity, negative_entitys]

    # Build the vocabulary
    token_freq = defaultdict(int)
    token_freq["UNKNOWN"] = 1e8
    #token_freq["PAD"] = 1e8 - 1
    for k, (p, n) in sample_set.items():
        tmp = [k, p] + n
        for t in tmp:
            if re_en.fullmatch(t):
                token_freq[t] += 1
            else:
                for e in list(t):
                    token_freq[e] += 1
    sorted_token_freq = sorted(token_freq.items(),
                               key=lambda d: d[1],
                               reverse=True)[:VOCAB_SIZE]
    word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
    if conf.over_write_vocab:
        print("generate word2id file: %s" % (conf.vocab))
        json.dump(word2id,
                  open(conf.vocab, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)

    _keys_ = list(sample_set.keys())
    train_set = {
        k: sample_set[k]
        for k in _keys_[:int(len(_keys_) * conf.train_valid_ratio)]
    }
    valid_set = {
        k: sample_set[k]
        for k in _keys_[int(len(_keys_) * conf.train_valid_ratio):]
    }
    print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d" %
          (len(sample_set), len(train_set), len(valid_set)))
    print("generate train sample file :%s\tvalid sample file: %s" %
          (conf.train_samples, conf.valid_samples))
    json.dump(train_set,
              open(conf.train_samples, "w", encoding="utf8"),
              ensure_ascii=False,
              indent=2)
    json.dump(valid_set,
              open(conf.valid_samples, "w", encoding="utf8"),
              ensure_ascii=False,
              indent=2)

def parse(file_contents, filename='') -> "List of Property objects":
    """Returns list of Property objects parsed from given text"""
    open_properties = [Property(None, [])]
    for line_num, line in enumerate(file_contents, start=1):
        values = open_properties[-1].value
        freshline = utils.clean_line(line)
        if not freshline:
            # Skip blank lines!
            continue

        if freshline.startswith('"'):   # data string
            line_contents = freshline.split('"')
            name = line_contents[1]
            if not utils.is_identifier(name):
                raise KeyValError(
                    'Invalid name ' + name + '!',
                    filename,
                    line_num,
                )
            try:
                value = line_contents[3]
                if not freshline.endswith('"'):
                    raise KeyValError(
                        'Key has value, but incomplete quotes!',
                        filename,
                        line_num,
                    )
                for orig, new in REPLACE_CHARS.items():
                    value = value.replace(orig, new)
            except IndexError:
                value = None

            values.append(Property(name, value))
        # handle name bare on one line, will need a brace on
        # the next line
        elif utils.is_identifier(freshline):
            values.append(Property(freshline, []))
        elif freshline.startswith('{'):
            if values[-1].value:
                raise KeyValError(
                    'Property cannot have sub-section if it already '
                    'has an in-line value.',
                    filename,
                    line_num,
                )
            values[-1].value = []
            open_properties.append(values[-1])
        elif freshline.startswith('}'):
            open_properties.pop()
        else:
            raise KeyValError(
                'Unexpected beginning character "' + freshline[0] + '"!',
                filename,
                line_num,
            )

        if not open_properties:
            raise KeyValError(
                'Too many closing brackets.',
                filename,
                line_num,
            )
    if len(open_properties) > 1:
        raise KeyValError(
            'End of text reached with remaining open sections.',
            filename,
            line=None,
        )
    return open_properties[0]

def parse(file_contents, filename='') -> "Property":
    """Returns a Property tree parsed from given text.

    filename, if set, should be the source of the text for debug purposes.
    file_contents should be an iterable of strings.
    """
    open_properties = [Property(None, [])]
    for line_num, line in enumerate(file_contents, start=1):
        values = open_properties[-1].value
        freshline = utils.clean_line(line)
        if not freshline:
            # Skip blank lines!
            continue

        if freshline.startswith('"'):   # data string
            line_contents = freshline.split('"')
            name = line_contents[1]
            if not utils.is_identifier(name):
                raise KeyValError(
                    'Invalid name ' + name + '!',
                    filename,
                    line_num,
                )
            try:
                value = line_contents[3]
                if not freshline.endswith('"'):
                    raise KeyValError(
                        'Key has value, but incomplete quotes!',
                        filename,
                        line_num,
                    )
                for orig, new in REPLACE_CHARS.items():
                    value = value.replace(orig, new)
            except IndexError:
                value = None

            values.append(Property(name, value))
        elif freshline.startswith('{'):
            if values[-1].value:
                raise KeyValError(
                    'Property cannot have sub-section if it already '
                    'has an in-line value.',
                    filename,
                    line_num,
                )
            values[-1].value = []
            open_properties.append(values[-1])
        elif freshline.startswith('}'):
            open_properties.pop()
        # handle name bare on one line, will need a brace on
        # the next line
        elif utils.is_identifier(freshline):
            values.append(Property(freshline, []))
        else:
            raise KeyValError(
                'Unexpected beginning character "' + freshline[0] + '"!',
                filename,
                line_num,
            )

        if not open_properties:
            raise KeyValError(
                'Too many closing brackets.',
                filename,
                line_num,
            )
    if len(open_properties) > 1:
        raise KeyValError(
            'End of text reached with remaining open sections.',
            filename,
            line=None,
        )
    return open_properties[0]

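A short usage sketch for the tree-returning parser above, feeding it a few lines of the Valve keyvalues syntax it accepts. The example text is made up, and it assumes Property exposes the same .name/.value attributes the parser itself uses (exact name normalisation depends on the project's Property class):

example_text = [
    'Config',
    '{',
    '    "file" "pack/example.cfg"',
    '    "enabled" "1"',
    '}',
]
root = parse(example_text, filename='example.txt')
for prop in root.value:          # top-level blocks
    print(prop.name)             # e.g. Config
    for child in prop.value:     # the key/value pairs inside the block
        print(' ', child.name, '=', child.value)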