def main():
    parser = _get_parser()
    opt = parser.parse_args()
    log_info(opt)

    preprocess(opt.dataset_path, opt.preprocess_mode)

    if opt.output_unit == 'character':
        generate_character_labels(opt.dataset_path, opt.labels_dest)
        generate_character_script(opt.dataset_path, opt.new_path, opt.script_prefix, opt.labels_dest)

    elif opt.output_unit == 'subword':
        generate_sentencepiece_input(opt.dataset_path)
        if not opt.use_pretrain_kobert_tokenizer:
            train_sentencepiece(opt.dataset_path, opt.vocab_size)
        generate_subword_labels('aihub_sentencepiece.vocab', opt.labels_dest, opt.use_pretrain_kobert_tokenizer)
        generate_subword_script(opt.dataset_path, opt.new_path, opt.script_prefix)

    elif opt.output_unit == 'grapheme':
        character_to_grapheme(opt.dataset_path, opt.grapheme_save_path)
        generate_grapheme_labels(opt.grapheme_save_path, opt.labels_dest)
        generate_grapheme_script(opt.grapheme_save_path, opt.new_path, opt.script_prefix, opt.labels_dest)

    else:
        raise ValueError("Unsupported preprocess method: {0}".format(opt.output_unit))

    gather_files(opt.dataset_path, opt.new_path)
from typing import List  # needed for the corrected type annotations below


def get_children(children: List[dict]) -> List[Node]:
    # original annotations were `children: []` and `-> [Node]`, which are not
    # valid type hints; List[dict] / List[Node] express the intended types
    nodes = []
    for child in children:
        nodes.append(
            Node(child["question"],
                 child["typical_answer"],
                 [[preprocess(x[0]), preprocess(x[1])] for x in child["solutions"]],
                 get_children(child["children"])))
    return nodes
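# A minimal sketch of the nested structure get_children expects. The keys
# ("question", "typical_answer", "solutions", "children") come from the
# function body above; the concrete values here are hypothetical.
# example_children = [
#     {
#         "question": "Is the device plugged in?",
#         "typical_answer": "yes",
#         "solutions": [["plug in the power cable", "the device turns on"]],
#         "children": [],  # leaf node: the recursion stops on an empty list
#     },
# ]
# nodes = get_children(example_children)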
def ask_if_it_helped(solution):
    print(solution.text)
    yes = preprocess.preprocess("yes")
    no = preprocess.preprocess("no")
    answer = preprocess.preprocess(input("Was it helpful? "))
    if yes.similarity(answer) - no.similarity(answer) > 0:
        print("You're welcome")
        return True
    else:
        return False
def __init__(self, data, answer, solutions=None, children=None):
    self.data = SpaCyTreeNode(preprocess.preprocess(data),
                              preprocess.preprocess(answer))
    self.parent = None
    self.children = []
    self.level = 0
    # similarity thresholds for matching questions (normal vs. full search)
    self.base_boarder = 0.75
    self.base_boarder_full_search = 0.91
    if solutions is not None:
        for solution in solutions:
            self.data.add_solution(solution[0], solution[1])
    if children is not None:
        for child in children:
            self.addChild(child)
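# A minimal construction sketch for this initializer; the question/answer
# strings and the solution pair are hypothetical.
# root_node = Node(
#     "My printer won't print",
#     "Let's narrow it down",
#     solutions=[["check the paper tray", "the paper was jammed"]],
# )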
def convert_sen3(sen3_file):
    """
    Convert Sentinel-3 data to reflectance.

    Args:
        sen3_file: path to the .SEN3 product directory

    Returns:
        tuple with data bands as M x N np.arrays:
            (
                S1_reflectance_an,
                S2_reflectance_an,
                S3_reflectance_an,
                S4_reflectance_an,
                S5_reflectance_an,
                S6_reflectance_an,
                S7_BT_in,
                S8_BT_in,
                S9_BT_in,
            )
        Affine transform
    """
    sen3_file = Path(sen3_file)
    cfg = conftools.load_directory(Path(__file__).parent / "config")
    cfg['workdir'] = sen3_file.parents[0]
    cfg['tmpdir'] = sen3_file.parents[0]
    ofile = preprocess(sen3_file, cfg, overwrite=False)
    data_channels, s3_transform = read_ofile(ofile)
    # TODO: remove the temporary file
    return data_channels, s3_transform
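# A minimal usage sketch; the .SEN3 product name below is hypothetical.
# bands, transform = convert_sen3("data/S3A_SL_1_RBT____20200101T000000.SEN3")
# s7_bt = bands[6]  # brightness temperature, channel S7_BT_in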
def get_fa_scores(self, df, doc_colname, save_path=None, tfidf=False, format="virtue_vice"):
    df = df.reset_index(drop=True)
    docs = df[doc_colname]
    print(f'Preprocessing column {doc_colname}')
    docs = preprocess(docs).reset_index(drop=True)
    baseline_docs = []  # TODO: docs.sample(frac=0.3, random_state=157).reset_index(drop=True)
    # TODO: build the w2v model
    print("Let's calculate bias and intensity")
    bias, intensity = self.doc_scores(docs=docs, baseline_docs=baseline_docs, tfidf=tfidf)
    print('total size: ', df.shape[0])
    # NaN means empty docs; we should remove them
    print('any NaN in bias?', np.isnan(bias.values).sum())
    print('any NaN in intensity?', np.isnan(intensity.values).sum())
    fa_scores = pd.concat([df, bias, intensity], axis=1)
    fa_scores = fa_scores.dropna(
        subset=bias.columns.tolist() + intensity.columns.tolist()).reset_index(drop=True)
    print('NaN scores dropped, new size:', fa_scores.shape[0])
    if format == "virtue_vice":
        df_virtue_vice = []
        for index, row in fa_scores.iterrows():
            row_virtue_vice = {}
            for mf in self.axes.keys():
                if row[f'bias_{mf}'] < 0:
                    row_virtue_vice[f'{mf}.vice'] = row[f'intensity_{mf}']
                    row_virtue_vice[f'{mf}.virtue'] = 0
                else:
                    row_virtue_vice[f'{mf}.virtue'] = row[f'intensity_{mf}']
                    row_virtue_vice[f'{mf}.vice'] = 0
            df_virtue_vice.append(row_virtue_vice)
        df_virtue_vice = pd.DataFrame(df_virtue_vice)
        fa_scores = pd.concat([fa_scores, df_virtue_vice], axis=1)
        print('After adding vice-virtue scores, the shape:', fa_scores.shape)
    if save_path:
        if len(save_path.split('/')) > 1:
            output_dir = '/'.join(save_path.split('/')[:-1])
            Path(output_dir).mkdir(parents=True, exist_ok=True)
        fa_scores.to_csv(save_path, index=None, header=True)
        print('Moral Foundations FrameAxis scores saved to {}'.format(save_path))
    else:
        print('not saving the fa scores.')
    return fa_scores
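# A minimal usage sketch; the DataFrame contents and output path are
# hypothetical, and `fa` stands in for an instance of the class that
# defines get_fa_scores.
# df = pd.DataFrame({"text": ["we must protect the vulnerable",
#                             "cheaters deserve punishment"]})
# scores = fa.get_fa_scores(df, doc_colname="text", save_path="out/fa_scores.csv")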
def similarity_with_wrong_answer(self, user_input):
    self.you_are_wrong_answer = [
        preprocess.preprocess(x) for x in [
            "That is not what I meant",
            "You are wrong",
            "You are misunderstanding me",
            "You don't understand",
        ]
    ]
    return max(x.similarity(user_input) for x in self.you_are_wrong_answer)
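# A minimal usage sketch; `bot` stands in for an instance of the class above,
# and the input string is hypothetical.
# doc = preprocess.preprocess("No, that's not what I asked")
# score = bot.similarity_with_wrong_answer(doc)  # high score => user says we got it wrong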
def main():
    parser = _get_parser()
    opt = parser.parse_args()
    log_info(opt)

    audio_paths, transcripts = preprocess(opt.dataset_path, opt.preprocess_mode)

    if opt.output_unit == 'character':
        generate_character_labels(transcripts, opt.vocab_dest)
        generate_character_script(audio_paths, transcripts, opt.vocab_dest)
    elif opt.output_unit == 'subword':
        train_sentencepiece(transcripts, opt.savepath, opt.vocab_size)
        sentence_to_subwords(audio_paths, transcripts, opt.savepath)
    elif opt.output_unit == 'grapheme':
        sentence_to_grapheme(audio_paths, transcripts, opt.vocab_dest)
    else:
        raise ValueError("Unsupported preprocess method: {0}".format(opt.output_unit))
def _build(difficulty: Union[int, bool], thread_count: int, is_silent: bool,
           build_files: Optional[List[str]] = None) -> bool:
    try:
        with open("meta/build.json", "r") as f:
            build_params = json.load(f)
    except OSError:  # was a bare except; only missing/unreadable files are expected here
        print("meta/build.json does not exist", file=sys.stderr)
        return False

    if difficulty is True:
        difficulty = build_params.get("release-opt", 9)
    elif difficulty is False:
        difficulty = build_params.get("debug-opt", 6)

    files_to_build: List[Dict[str, Any]]
    if build_files is None:
        files_to_build = build_params.get("files", [])
    else:
        files_to_build = [
            file for file in build_params.get("files", [])
            if file["path"] in build_files
        ]

    file_sizes: Dict[str, int] = {}

    if not path.exists("build/"):
        mkdir("build/")

    for file in files_to_build:
        if "path" not in file:
            print("Malformed JSON", file=sys.stderr)
            return False

        # Step 0: Read file
        try:
            with open("src/" + file["path"], "r") as f:
                step_0 = f.read()
        except OSError:
            print(f'File {file["path"]} does not exist', file=sys.stderr)
            return False

        if not is_silent:
            print("Processing file", file["path"], file=sys.stderr)

        step_0_size = len(step_0)

        # Step 1: Find chars
        step_1_vars = file.get("variables", [])
        step_1_aliases = file.get("aliases", {})
        step_1_excluded = file.get("excluded", [])
        try:
            step_1_size, step_1_list, _ = find_chars(step_0, step_1_vars, step_1_aliases,
                                                     step_1_excluded, difficulty,
                                                     thread_count, silent=is_silent)
        except Exception as e:
            print("Step 1 failed: crash", file=sys.stderr)
            print(e, file=sys.stderr)
            return False

        if len(step_1_list) == 0:
            print("Step 1 failed: no combinations", file=sys.stderr)
            return False

        step_1 = step_0
        for src, tgt in step_1_list[0]:
            step_1 = step_1.replace(src, tgt)

        if not is_silent:
            print("Step 1 complete", file=sys.stderr)

        # Step 2: Preprocess and template code
        template = file.get("template", "template.py")
        with open("preprocess/" + template, "rb") as f:
            step_2_template = f.read()

        step_2_compressed_code = preprocess(step_1, step_1_list[0])
        step_2 = step_2_template.replace(b"{{code}}", step_2_compressed_code)
        step_2_size = len(step_2)

        if not is_silent:
            print("Step 2 complete", file=sys.stderr)

        # Note that the file is overwritten
        with open("build/" + file["path"], "wb") as f:
            f.write(step_2)
        file_sizes[file["path"]] = step_2_size

        if not is_silent:
            print('===', file=sys.stderr)
            print(f'File {file["path"]}:', file=sys.stderr)
            print(f'  {step_0_size} => {step_1_size} characters', file=sys.stderr)
            print(f'  Final size: {step_2_size}', file=sys.stderr)
            print('===', file=sys.stderr)

    if not is_silent:
        print("Cumulative size:", sum(file_sizes.values()), file=sys.stderr)

    return True
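# A hedged sketch of the meta/build.json layout implied by the lookups above
# (top-level "release-opt", "debug-opt", "files"; per-file "path", "variables",
# "aliases", "excluded", "template"). The concrete values are hypothetical.
# {
#     "release-opt": 9,
#     "debug-opt": 6,
#     "files": [
#         {
#             "path": "solve.py",
#             "variables": ["a", "b"],
#             "aliases": {"print": "p"},
#             "excluded": ["\n"],
#             "template": "template.py"
#         }
#     ]
# }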
# -*- coding: utf-8 -*-
from preprocess.preprocess import preprocess

preprocess()
import os

# project-local modules used below (assumed available on the import path);
# they were referenced but never imported in the original
import es_model
import preprocess
import word2vect

LOAD_DATA = False
INDEX_NAME = 'qa'
PREPROCESS = False
GENERATE_VECT = True
TOPN = 20

org_input_file = '../data/kuaixue_org.csv'
input_file = '../data/kuaixue_p.csv'
stop_words_file = '../data/stopwords.txt'
my_vect_file = '../data/question.word2vec.bin'

if __name__ == "__main__":
    if PREPROCESS:
        preprocess.preprocess(org_input_file, input_file)

    if GENERATE_VECT:
        word2vect.generate_model(input_file, stop_words_file, my_vect_file)

    if LOAD_DATA:
        if os.path.exists(input_file):
            es_mode = es_model.ES_Model(input_file, INDEX_NAME, stop_words_file, True)
        else:
            print("Input file does not exist")
    else:
        # es_mode = es_model.ES_Model(input_file, INDEX_NAME, stop_words_file, False)
        pass
def run_preprocess(cfg_mysql, cfg_pipeline, cfg_course, cfg_mysql_script_path):
    print("###### Step 1: Pre-processing database")
    preprocess.preprocess(cfg_mysql, cfg_pipeline, cfg_course, cfg_mysql_script_path)
    print("Done")
def get_doc_from_input(message):
    user_input = input(message)
    return preprocess.preprocess(user_input)
import argparse
import os

import cv2  # used for imwrite/imshow below; was missing from the original imports
import imutils
import numpy as np
import skimage

import extraction.extract_micr as extract
import preprocess.preprocess as p

ap = argparse.ArgumentParser()
ap.add_argument('--image', required=True, help='Absolute path of image')
args = vars(ap.parse_args())

# directory = os.getcwd()
# data_directory = directory + '/cheques/'
# print(f'Images available: {os.listdir(data_directory)}')

input_ = args['image']
image = input_
print(f'Extracting from: {image}')

preprocessed_img = p.preprocess(image_path=image)
extracted_micr, contour_img = extract.extract_micr(image=preprocessed_img)
print(f'MICR Code: {extracted_micr}')

cv2.imwrite('ocr_cheque.jpg', contour_img)
cv2.imshow('Detected MICR code', cv2.resize(contour_img, (1000, 400)))
cv2.waitKey(0)
cv2.destroyAllWindows()
print('Exiting...')
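# Example invocation; the script filename is hypothetical, the --image flag
# comes from the argparse setup above:
#   python cheque_ocr.py --image /path/to/cheque.jpg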
def up_to_july(professions):
    """
    Runs normal, inter-year mobility analysis using only data for the first seven months
    of each year, i.e. up to July.

    :param professions: dict where key is profession name and value is base path to
                        month-level data table
    :return: None
    """
    for prof, path in professions.items():
        # THIS IS FOR RUNNING ANALYSES USING ONLY MONTH DATA UP TO JULY
        # NB: for the sampler below to work properly (i.e. always try to sample July) you need
        # to change the value for "judges" in function preprocess.sample.get_sampling_month
        # from 4 to 7. Otherwise it tries to sample April, for judges. No such issue for
        # prosecutors.

        # for each year, sample by throwing out all observations occurring in months AFTER July
        with open(path, 'r') as infile:
            person_month_table = list(csv.reader(infile))[1:]
        sampled_years = [y for y in range(2006, 2021)]
        sampled_table = sample.mo_yr_sample(person_month_table, prof,
                                            [1, 2, 3, 4, 5, 6, 7], sampled_years)

        # write sampled table to disk
        samp_file_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' + 'sampled_collected/'
        with open(samp_file_dir + prof + '_to_july_sampled_month.csv', 'w') as out_file:
            writer = csv.writer(out_file)
            [writer.writerow(pm) for pm in sampled_table]

        # run preprocessor on sampled data
        prep_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' + 'preprocessed/'
        prep_file_path = prep_dir + prof + '_preprocessed.csv'
        std_log_path = prep_dir
        pids_log_path = prep_dir
        preprocess.preprocess(samp_file_dir, prep_file_path, std_log_path, pids_log_path, prof)

        # get descriptor tables using the preprocessed data
        descr_out_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' + 'descriptors/'
        with open(prep_file_path, 'r') as in_f:
            table = list(csv.reader(in_f))[1:]

        start_year, end_year = 2006, 2020

        # make table of total counts per year
        describe.year_counts_table(table, start_year, end_year, prof, descr_out_dir)

        # make tables of total counts per year, per level in judicial hierarchy
        describe.year_counts_table(table, start_year, end_year, prof, descr_out_dir, unit_type='nivel')

        # make tables for entry and exit cohorts, per year, per gender, per level in judicial hierarchy
        describe.entry_exit_gender(table, start_year, end_year, prof, descr_out_dir, entry=False, unit_type='nivel')
        describe.entry_exit_gender(table, start_year, end_year, prof, descr_out_dir, entry=True, unit_type='nivel')

        # make table for mobility between appellate court regions
        describe.inter_unit_mobility_table(table, descr_out_dir, prof, 'ca cod')

        # make table for hierarchical mobility
        describe.hierarchical_mobility_table(table, descr_out_dir, prof)

        for unit_type in ['ca cod', 'nivel']:
            # make tables for entry and exit cohorts, per year per unit type
            describe.entry_exit_unit_table(table, start_year, end_year, prof, unit_type, descr_out_dir, entry=True)
            describe.entry_exit_unit_table(table, start_year, end_year, prof, unit_type, descr_out_dir, entry=False)
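# A minimal usage sketch; the profession names match those referenced in the
# comments above, but the file paths are hypothetical.
# up_to_july({
#     'judges': root + 'collected/judges/judges_month.csv',
#     'prosecutors': root + 'collected/prosecutors/prosecutors_month.csv',
# })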
parser = argparse.ArgumentParser(description='End-to-end Speech Recognition')
parser.add_argument('--dataset_path', type=str,
                    default='SET YOUR KsponSpeech corpus PATH')
parser.add_argument('--new_path', type=str,
                    default='SET YOUR path to store preprocessed KsponSpeech corpus')
parser.add_argument('--labels_dest', type=str,
                    default='SET YOUR path to store aihub_labels.csv file')
parser.add_argument('--script_prefix', type=str, default='KsponScript_',
                    help='default: KsponScript_FILENUM.txt')
parser.add_argument('--mode', type=str, default='numeric',
                    help='default: numeric(6->"6"), optional: phonetic(6->"육")')
parser.add_argument('--filenum_adjust', action='store_true', default=False,
                    help='adjust file number for handling "%%"')
opt = parser.parse_args()

preprocess(opt.dataset_path, opt.new_path, opt.mode, opt.filenum_adjust)
create_char_labels(opt.new_path, opt.labels_dest)
create_script(opt.dataset_path, opt.new_path, opt.script_prefix)
# gather_files(opt.dataset_path, opt.new_path, opt.script_prefix)
if prof in {'judges', 'prosecutors'}:
    in_dir = root + trunks['dispersed'] + leaves[prof]['dispersed']['raw']
    scrape_log = root + trunks['dispersed'] + leaves[prof]['dispersed']['scrape log']
    scrape.update_db(in_dir, scrape_log, prof)

# collect the data (which also does a first clean)
in_dir = root + trunks['dispersed'] + leaves[prof]['dispersed']['raw']
out_path = root + trunks['collected'] + leaves[prof]['collected']['file']
make_table.make_pp_table(in_dir, out_path, prof)

# preprocess the data (add variables, standardise names, assign unique IDs, etc.)
in_dir = root + trunks['collected'] + leaves[prof]['collected']['dir']
pop_out_path = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['population']
std_log_path = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['standardise']
pids_log_path = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['pids']
preprocess.preprocess(in_dir, pop_out_path, std_log_path, pids_log_path, prof)

# describe the data, i.e. generate tables of descriptive statistics for different samples
pop_in_file = root + trunks['preprocessed'] + leaves[prof]['preprocessed']['population']
for sample in deets['samples']:
    # make directory tree for dumping the descriptives tables
    # NB: overwrites existing tree structure
    sample_out_dirs = {'totals': '', 'entry_exit': '', 'mobility': '', 'inheritance': ''}
    for d in sample_out_dirs:
        path_end = sample + '/' + d + '/'
        sample_out_dirs.update({d: root + trunks['descriptives'] + leaves[prof]['descriptives'] + path_end})
    [Path(d).mkdir(parents=True, exist_ok=True) for d in sample_out_dirs.values()]

    # generate the descriptives tables
    describe.describe(pop_in_file, sample, sample_out_dirs['totals'], sample_out_dirs['entry_exit'],
                      sample_out_dirs['mobility'], sample_out_dirs['inheritance'],
                      prof, deets['range'][0], deets['range'][1], deets['units'])
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 7 14:24:07 2017
Version 0.1 Finalized
@author: Bahram
"""
# This code reads a CSV file of tweets and labels to feed a NB classifier,
# for computing the classifier's parameters.
import pickle
import csv
import tokenizer
from trainer import Trainer
from operator import itemgetter
from classifier import Classifier
from preprocess.preprocess import preprocess

# to get tokens, stem, remove stop words, and detect negation
tweetTrainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=['?!#%&']))
process = preprocess()


def processing(training):
    # preprocess sentences: stemming, stop-word removal, and negation detection
    trainingProcessed = list(map(process.gettokens, training))
    trainingProcessed = list(map(process.stemtokens, trainingProcessed))
    trainingProcessed = list(map(process.removestopwords, trainingProcessed))
    # print(trainingProcessed)
    trainingProcessed = list(map(process.negatesequence, trainingProcessed))
    trainingProcessedWords = []
    for sentence in trainingProcessed:
        sentence = ' '.join(sentence)
        wordsProcessed = sentence.split()
        for wordProcessed in wordsProcessed:
            trainingProcessedWords.append(wordProcessed)
    return trainingProcessedWords
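# A minimal usage sketch for processing(); the example tweets are hypothetical.
# tokens = processing(["I don't like rainy days", "great game last night!"])
# print(tokens)  # flat list of stemmed, stop-word-filtered, negation-marked tokens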
""" @github{ title = {KsponSpeech.preprocess}, author = {Soohwan Kim}, publisher = {GitHub}, url = {https://github.com/sooftware/KsponSpeech.preprocess}, year = {2020} } """ import argparse from preprocess.preprocess import preprocess, create_char_labels, create_script if __name__ == '__main__': parser = argparse.ArgumentParser(description='End-to-end Speech Recognition') parser.add_argument('--dataset_path', type=str, default='SET YOUR KsponSpeech corpus PATH') parser.add_argument('--script_prefix', type=str, default='KsponScript_', help='default: KsponScript_FILENUM.txt') opt = parser.parse_args() preprocess(opt.dataset_path) create_char_labels(opt.dataset_path) create_script(opt.dataset_path, opt.script_prefix)