def run(model, filename=None, train_filename=None, predict_filename=None,
        line_function=None, train_line_function=None, predict_line_function=None,
        evaluate_function=None, split=0.8, header=True):
    # Fall back to the generic line function / filename when the
    # train- and predict-specific variants are not given.
    if train_line_function is None and line_function is not None:
        train_line_function = line_function
    if predict_line_function is None and line_function is not None:
        predict_line_function = line_function
    if train_filename is None and filename is not None:
        train_filename = filename
    if predict_filename is None and filename is not None:
        predict_filename = filename

    # A sequence of models means one model per core.
    num_cores = len(model) if isinstance(model, collections.abc.Sequence) else 1
    if num_cores > 1:
        os.system("spanning_tree")
        if header:
            # Strip the header row so the workers see only data lines.
            num_lines = sum(1 for line in open(train_filename))
            os.system('tail -n {} {} > {}'.format(
                num_lines - 1, train_filename, train_filename + '_'))
            if predict_filename != train_filename:
                num_lines = sum(1 for line in open(predict_filename))
                os.system('tail -n {} {} > {}'.format(
                    num_lines - 1, predict_filename, predict_filename + '_'))
            train_filename = train_filename + '_'
            predict_filename = predict_filename + '_'
            header = False
        split_file(train_filename, num_cores)
        if predict_filename != train_filename:
            split_file(predict_filename, num_cores)
        pool = Pool(num_cores)
        # split_file writes zero-padded two-digit suffixes: name00, name01, ...
        train_filenames = [train_filename + str(n).zfill(2) for n in range(num_cores)]
        predict_filenames = [predict_filename + str(n).zfill(2) for n in range(num_cores)]
        args = []
        for i in range(num_cores):
            args.append({'model': model[i],
                         'train_filename': train_filenames[i],
                         'predict_filename': predict_filenames[i],
                         'train_line_function': train_line_function,
                         'predict_line_function': predict_line_function,
                         'evaluate_function': evaluate_function,
                         'split': split,
                         'quiet': model[i].params.get('quiet'),
                         'multicore': True,
                         'header': header})
        results = sum(pool.map(run_model, args), [])
        if evaluate_function:
            print(evaluate_function(results))
        for f in train_filenames + predict_filenames:
            safe_remove(f)
        os.system('killall spanning_tree')
        return results
    else:
        return run_(model, train_filename=train_filename,
                    predict_filename=predict_filename,
                    train_line_function=train_line_function,
                    predict_line_function=predict_line_function,
                    evaluate_function=evaluate_function, split=split,
                    quiet=model.params.get('quiet'), multicore=False,
                    header=header)
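# For context: every snippet in this listing leans on a split_file helper
# whose exact behavior varies by project. A minimal, hypothetical sketch
# consistent with the zero-padded name00, name01, ... part files the runner
# above reads back (inferred from the call sites, not a documented API):
def split_file(filename, num_parts):
    """Hypothetical: split a text file into num_parts pieces named
    <filename>00, <filename>01, ... (two-digit, zero-padded suffixes)."""
    with open(filename) as f:
        lines = f.readlines()
    chunk = -(-len(lines) // num_parts)  # ceiling division
    for n in range(num_parts):
        with open(filename + str(n).zfill(2), 'w') as out:
            out.writelines(lines[n * chunk:(n + 1) * chunk])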
def read_bytes_as_segments(segment_byte_data: bytes) -> Iterator[Segment]:
    # Each segment record is 12 bytes long.
    byte_file = BytesIO(segment_byte_data)
    segment_bytes = split_file(byte_file, 12)
    counted_segment_bytes = enumerate(segment_bytes)
    yield from (get_segment(i, current_byte)
                for i, current_byte in counted_segment_bytes)
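# A DOOM SEGS record packs six little-endian int16 fields into those 12
# bytes (start vertex, end vertex, angle, linedef, direction, offset).
# A hedged sketch of what get_segment might do -- the Segment constructor
# and everything beyond the standard WAD layout are assumptions:
import struct

def get_segment(index, record):
    # Unpack one 12-byte SEGS record: six little-endian int16s.
    start, end, angle, linedef, direction, offset = struct.unpack('<6h', record)
    return Segment(index, start, end, angle, linedef, direction, offset)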
def index_worker(index, ipath):
    """Index raw JSON documents in bulk.

    Parameters
    ----------
    index : str
        Name of the index.
    ipath : str
        Path to the raw JSON file.
    """
    global drop_count
    opath = 'tmp.json'
    for chunk in split_file(ipath, 10000):
        with open(opath, 'w') as ofp:
            for doc in chunk:
                # Drop documents that lack the required fields.
                if 'title' not in doc or 'abstract' not in doc:
                    drop_count += 1
                    continue
                doc['title'] = doc['title'].lower().strip()
                doc['abstract'] = doc['abstract'].lower()
                doc['keywords'] = [e.lower().strip() for e in doc.get('keywords', [])]
                doc['fos'] = [e.lower().strip() for e in doc.get('fos', [])]
                # Bulk format: an action line followed by the document line.
                json.dump({'index': {'_index': index}}, ofp)
                ofp.write('\n')
                json.dump(doc, ofp)
                ofp.write('\n')
        bulk_insert(index, opath)
    refresh(index)
    os.remove(opath)
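# The action-line/document-line pairs written above follow the
# Elasticsearch bulk format. A minimal sketch of a compatible bulk_insert,
# assuming a local Elasticsearch node (the host URL and this helper's
# signature are assumptions):
import requests

def bulk_insert(index, path, host='http://localhost:9200'):
    # POST the newline-delimited action/document pairs to _bulk.
    with open(path, 'rb') as fp:
        resp = requests.post(f'{host}/_bulk', data=fp.read(),
                             headers={'Content-Type': 'application/x-ndjson'})
    resp.raise_for_status()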
def get_file_index_info(header_data):
    with open("doom1.wad", 'rb') as wadfile:
        wadfile.seek(header_data.info_table_offset)
        # WAD directory entries are 16 bytes each.
        file_chunks = split_file(wadfile, 16)
        enumerated_file_parts = enumerate(file_chunks)
        yield from ((position, get_index_entry(file_chunk))
                    for position, file_chunk in enumerated_file_parts)
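# Each 16-byte WAD directory entry holds a lump's file offset, its size,
# and an 8-byte NUL-padded name. A hedged sketch of get_index_entry under
# that layout -- the IndexEntry container is an assumption:
import struct

def get_index_entry(chunk):
    # Unpack one directory entry: int32 offset, int32 size, 8-byte name.
    offset, size, raw_name = struct.unpack('<ii8s', chunk)
    return IndexEntry(offset, size, raw_name.rstrip(b'\x00').decode('ascii'))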
def create_rhymescheme(self):
    """Create the rhyme scheme list for the lyrics and write it to a file."""
    lyrics = split_file(self.lyrics_filename)
    schemes = []
    for line in lyrics:
        schemes.append(self.line_rhymescheme(line))
    with open(self.scheme_filename, "w", encoding='utf-8') as f:
        f.write('\n'.join(schemes) + '\n')
def evaluate(ident, syllable_rhyme=True, generated=False):
    gen = '_generated' if generated else ''
    syl = '.syl' if syllable_rhyme else ''
    endings = split_file(f'schemes/{ident}{gen}{syl}.schemes')
    return rhyme_score(endings)
def run(self) -> List[subprocess.CompletedProcess]:
    # Record file information to the database.
    self.record_file()
    # Split the file into smaller pieces for parallel transfer.
    split_files = split_file(file=self.file,
                             file_split_size=self.file_split_size,
                             file_split_chunk=self.cores)
    # Spawn a pool of workers, one sub-experiment per split.
    with Pool(self.cores) as pool:
        results = pool.map(self.run_sub_experiment, split_files)
    return results
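# Design note: pool.map blocks until every split finishes and returns
# results in input order, so the list lines up with split_files. Mapping
# the bound method self.run_sub_experiment also requires the instance to
# be picklable, which matters if the class ever holds an open handle.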
def average_syllables(target_list):
    """
    Counts the average number of syllables in lyrics for the specified
    artists and/or genres.

    :param target_list: identifiers (artists and/or genres), list of str
    :return: dictionary {identifier: average number of syllables}, dict
    """
    syllables = {}
    for ident in target_list:
        original_bars = split_file(f'data/{ident}.txt')
        count = 0
        excluded = 0
        for line in original_bars:
            syls = count_syllables(line)
            # Very short lines (3 syllables or fewer) are excluded as noise.
            if syls > 3:
                count += syls
            else:
                excluded += 1
        syllables[ident] = count / (len(original_bars) - excluded)
    return syllables
def __init__(self, identifier, params, syllable_rhyme=False):
    """
    Initialize the LyricsGenerator class, which provides methods for
    generating lyrics with Markov chains and an RNN: it generates Markov
    sequences from word1->word2 transition probabilities (markovify
    library) and uses an LSTM network to pick the most suitable sequence
    (LyricsNN class).

    :param identifier: current artist or genre, str
    :param params: contains depth, max_syllables, max_overlap, num_lines
        parameters, dict
    :param syllable_rhyme: True to use morpheme-based rhyme, False to use
        2-last-letters based rhyme, bool
    """
    self.params = params
    self.identifier = identifier
    if syllable_rhyme:
        self.rhymer = RhymerSyl(identifier)
        self.path_modifier = '_syl'
    else:
        self.rhymer = RhymerEnd(identifier)
        self.path_modifier = ''
    self.training_file = f"data/{identifier}.txt"
    self.lyrics_model = LyricsNN(self.params['depth'], identifier)
    self.markov_model = create_markov_model(self.training_file)
    self.original_bars = split_file(self.training_file)
def main():
    # if len(sys.argv) != 5:
    #     logging.info('please input args: car_path, road_path, cross_path, answer_path')
    #     exit(1)
    # car_path = sys.argv[1]
    # road_path = sys.argv[2]
    # cross_path = sys.argv[3]
    # answer_path = sys.argv[4]
    car_path = '../config_5/car.txt'
    road_path = '../config_5/road.txt'
    cross_path = '../config_5/cross.txt'
    answer_path = '../config_5/answer.txt'

    car_paths = split_file(car_path, 8000)
    base_answer_dir = split(answer_path)[0]
    answer_paths = [join(base_answer_dir, 'answer_{}.txt'.format(ix))
                    for ix, _ in enumerate(car_paths)]

    # Pair each car-file split with its own answer file and run the
    # driver for every pair in parallel.
    pool = multiprocessing.Pool()
    results = []
    for car_p, answer_p in zip(car_paths, answer_paths):
        result = pool.apply_async(driver,
                                  args=(road_path, car_p, cross_path, answer_p))
        results.append(result)
    pool.close()
    pool.join()
    for r in results:
        print(r.get())
def generating_phase(self):
    """
    Generation phase: consecutive creation and filtering of Markov
    sequences and vectors, and conversion of these vectors into lyrics.

    :return: generated lyrics, str
    """
    markov_bars = self.generate_lyrics()
    if os.path.exists(self.rhymer.rhyme_filename):
        rhyme_list = split_file(self.rhymer.rhyme_filename)
    else:
        print("Rhyme list was not created, please train the model first.")
        return
    vectors = self.create_vectors(rhyme_list)
    lyrics = self.vectors_into_lyrics(vectors, markov_bars, rhyme_list)
    lyrics_str = ''
    with open(f"generated_lyrics/{self.identifier}{self.path_modifier}_generated.txt",
              "w", encoding='utf-8') as f:
        for bar in lyrics:
            f.write(bar)
            f.write("\n")
            lyrics_str += bar + '\n'
    return lyrics_str
def read_bytes_as_lines(line_byte_data: bytes) -> Iterator[Line]:
    # Each line record is 14 bytes long.
    byte_file = BytesIO(line_byte_data)
    line_bytes = split_file(byte_file, 14)
    counted_line_bytes = enumerate(line_bytes)
    yield from (get_line(i, current_byte)
                for i, current_byte in counted_line_bytes)