def main():
    text1 = tknzr.tokenize_file("sample_input.txt")
    processed_text1 = preprocess_text(text1)
    v11 = build_vector(processed_text1)
    v12 = build_vector(processed_text1, type="bigram")
    print(v11)
    print(v12)

    text2 = tknzr.tokenize_file("sample_input2.txt")
    processed_text2 = preprocess_text(text2)
    v21 = build_vector(processed_text2)
    v22 = build_vector(processed_text2, type="bigram")
    print(v21)
    print(v22)

    print(ang.angle_between(v11, v21))
    print(ang.angle_between(v12, v22))
def create_BOW(root_directory='./preprocessed_texts/'):
    """
    :type root_directory: str
    """
    training_path = os.path.join(root_directory, "training")
    training_bag_of_author = {}
    # super_counter = Counter()
    doc_count_of_author = {}
    authors = list_dirs(training_path)
    # total_doc_count = 0
    for author in authors:
        bag = Counter()
        author_path = os.path.join(training_path, author)
        files_of_author = list_files(author_path)
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            bag += Counter(tokens)
        training_bag_of_author[author] = bag
        doc_count = len(files_of_author)
        doc_count_of_author[author] = doc_count
        # total_doc_count += doc_count
        # super_counter += bag
    # print(super_counter.most_common(10))
    return training_bag_of_author, doc_count_of_author
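# A minimal usage sketch (an assumption, not part of the original project): the bags
# and per-author document counts returned by create_BOW() are exactly what
# calculate_confusion_matrix() further below expects, with the directory layout
# ./preprocessed_texts/{training,test}/<author>/ implied by the defaults above.
training_bags, doc_counts = create_BOW('./preprocessed_texts/')
confusion = calculate_confusion_matrix(training_bags, doc_counts, './preprocessed_texts/')
print(confusion)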
def replace_constants(source):
    """
    For each line, if it is necessary, it replaces a constant with a call to a new random function.

    :param source: File path
    :return: A list of lines
    """
    # apply the tokenizer to every line, so lines is a list of strings
    lines = tokenizer.tokenize_file(source)
    # then apply the variable/constant replacement to every line
    lines = replace_constant_var_num(lines)
    # rename the variables used by while loops
    lines = replace_constant_while(lines)
    # rename the variables used by for loops
    lines = replace_constant_for(lines)

    pattern = r'import\s+\w+\s*'
    for index, line in enumerate(lines):
        if re.search(pattern, line) is None:
            break
    for block in new_def:
        lines.insert(index, block)
        index += 1
    return lines
def replace_instructions(source):  # called from the main code
    """
    For each line, if it is necessary, it replaces an instruction with a sequence of instructions.

    :param source: Source file path.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)  # in "tokenizer.py": reads the lines of the source file (in this case output.py)
    for index, line in enumerate(lines):
        line_tokenized = tokenizer.tokenize_line(line)  # line_tokenized is a list of strings
        line_to_replace = line

        # short to long
        pattern = r'\w+[\+\-\*\/]=\w+'  # regex for augmented assignments
        if re.search(pattern, line) is not None:
            # transform a line containing a short-form operation into its long form (e.g. v+=1 --> v=v+1)
            line_to_replace = short_to_long(line_tokenized)
            line_tokenized = tokenizer.tokenize_line(line_to_replace)

        # look for the matching pattern
        pattern = match_pattern(line_to_replace)
        if pattern == 0:  # var = var + var
            # find the operation actually performed, then replace the line at
            # position `index` with a for/while loop that computes the same result;
            # in practice this inflates the program both in line count and in execution time
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_var(operators)
        elif pattern == 1:  # var = var + num
            # find the operation actually performed
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_num(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_num(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_num(operators)
        elif pattern == 2:  # var = num + var
            # find the operation actually performed
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_num_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_num_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_num_var(operators)
    return lines
def replace_instructions(source):
    """
    For each line, if it is necessary, it replaces an instruction with a sequence of instructions.

    :param source: Source file path.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)
    for index, line in enumerate(lines):
        line_tokenized = tokenizer.tokenize_line(line)
        line_to_replace = line

        # short to long
        pattern = r'\w+[\+\-\*\/]=\w+'
        if re.search(pattern, line) is not None:
            line_to_replace = short_to_long(line_tokenized)
            line_tokenized = tokenizer.tokenize_line(line_to_replace)

        # match the correct pattern
        pattern = match_pattern(line_to_replace)
        if pattern == 0:  # var = var + var
            # match the correct operation
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_var(operators)
        elif pattern == 1:  # var = var + num
            # match the correct operation
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_num(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_num(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_num(operators)
        elif pattern == 2:  # var = num + var
            # match the correct operation
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_num_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_num_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_num_var(operators)
    return lines
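# The pass above relies on short_to_long() to rewrite augmented assignments before
# pattern matching. A minimal sketch of that rewrite, working on a raw line instead
# of a token list (an illustration under assumptions, not the project's helper):
import re

def short_to_long_sketch(line):
    # "v += 1"  ->  "v = v + 1"
    m = re.match(r'\s*(\w+)\s*([+\-*/])=\s*(.+)', line)
    if m:
        var, op, rhs = m.groups()
        return "{0} = {0} {1} {2}".format(var, op, rhs)
    return line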
def get_info(filename):
    text = tknzr.tokenize_file(filename)
    cnt_sent = len(text)
    cnt_words = 0
    cnt_symbols = 0
    cnt_words_len = 0
    for sentence in text:
        for token in sentence:
            cnt_symbols += len(token)
            if not is_punct(token):
                cnt_words += 1
                cnt_words_len += len(token)
    print("{0}\nsentences: {1}\nwords: {2}\nsymbols: {3}\n"
          "average words in sentence: {4}\naverage symbols in sentence: {5}\n"
          "average word length: {6}\n".format(
              filename, cnt_sent, cnt_words, cnt_symbols,
              cnt_words / cnt_sent, cnt_symbols / cnt_sent, cnt_words_len / cnt_words))
    return (cnt_sent, cnt_words, cnt_symbols,
            (cnt_words / cnt_sent), (cnt_symbols / cnt_sent), (cnt_words_len / cnt_words))
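# get_info() assumes an is_punct() helper that is not shown above. One plausible
# implementation (an assumption based on string.punctuation; the original may differ):
import string

def is_punct(token):
    # treat a token as punctuation if every character in it is a punctuation mark
    return len(token) > 0 and all(ch in string.punctuation for ch in token)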
def obfuscate(source):
    """
    Given the source code, it searches for variable names and replaces them.

    :param source: Source file.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)
    for ind, line in enumerate(lines):
        for pattern in pattern_search.values():
            match = re.search(pattern, line)
            if match:
                search_variable_to_replace(line)
    lines = replace(lines)
    return (lines, replacement_dic)
def obfuscate(source):  # first function called by pythonCowObfuscator()
    """
    Given the source code, it searches for variable names and replaces them.

    :param source: Source file.
    :return: A list of lines.
    """
    # split the source code into lines; this is of course the code produced by the previous step
    lines = tokenizer.tokenize_file(source)
    for ind, line in enumerate(lines):
        for pattern in pattern_search.values():  # iterate over the regex patterns we want to find
            match = re.search(pattern, line)     # match the line against the pattern (i.e. the regex)
            if match:                            # if there is a match
                search_variable_to_replace(line)
    # replace the old variables with the new ones, over the whole file, i.e. every line
    lines = replace(lines)
    return (lines, replacement_dic)
def obfuscate(source, dictionary):
    # source: the file from the previous step; dictionary: the variables to replace
    """
    Given the source code and the variable dictionary, it searches for function names and replaces them.

    :param source: Source file.
    :param dictionary: Variable dictionary.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)  # split the file into lines
    for ind, line in enumerate(lines):       # index the lines and check them all
        pattern_search = r'\s*def\s*\w+\s*\(\w*'   # regex to look for: def statements, i.e. functions
        match = re.search(pattern_search, line)    # apply the regex and look for a def
        if match:                                  # if the current line matches
            search_function_to_replace(line, dictionary)  # record the function name to rename in this line
    lines = replace(lines)  # rewrite the lines using the entries in the dictionary
    return lines
def obfuscate(source, dictionary):
    """
    Given the source code and the variable dictionary, it searches for function names and replaces them.

    :param source: Source file.
    :param dictionary: Variable dictionary.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)
    for ind, line in enumerate(lines):
        pattern_search = r'\s*def\s*\w+\s*\(\w*'
        match = re.search(pattern_search, line)
        if match:
            search_function_to_replace(line, dictionary)
    lines = replace(lines)
    return lines
def main():
    file_in = constants.DEFAULT_INPUT_FILE
    file_preprocessed = constants.DEFAULT_PREPROCESSED_FILE
    file_out = constants.DEFAULT_OUTPUT_FILE
    print(file_in)
    time.sleep(0.005)  # if printing to stderr, let stdout output first

    lines_of_tokens = tokenize_file(file_in)
    instructions = token_parser(lines_of_tokens)
    with open(file_preprocessed, "w") as f:
        for inst in instructions:
            assert isinstance(inst, Instruction)
            line = inst.opcode.text.ljust(5) + " "
            line += " ".join([str(_.text) for _ in inst.operands])
            f.write(line + "\n")
    combinator_signals = inst_to_signals(instructions)

    # create blueprint
    bp = Blueprint()
    bp.generate_rom_entities(len(combinator_signals))
    bp.insert_signals(combinator_signals)

    # export
    json_string = json.dumps(bp.json_dict)
    output = bp_encode_base64(bp_compress(json_string))
    with open(file_out, "w") as f:
        f.write(output + "\n")

    # paste to clipboard, clip on Windows
    if platform.system() == "Windows":
        os.system("clip < " + file_out)
        print("Done. Blueprint string on clipboard.")
    else:
        print("Done. Blueprint string saved as " + file_out)
def replace_constants(source):
    """
    For each line, if it is necessary, it replaces a constant with a call to a new random function.

    :param source: File path
    :return: A list of lines
    """
    lines = tokenizer.tokenize_file(source)
    lines = replace_constant_var_num(lines)
    lines = replace_constant_while(lines)
    lines = replace_constant_for(lines)

    pattern = r'import\s+\w+\s*'
    for index, line in enumerate(lines):
        if re.search(pattern, line) is None:
            break
    for block in new_def:
        lines.insert(index, block)
        index += 1
    return lines
def calculate_confusion_matrix(training_bags, doc_counts, output_path='./preprocessed_texts/'):
    authors = list(training_bags.keys())
    confusion_matrix = np.zeros([len(authors), len(authors)], dtype=np.integer)
    test_path = os.path.join(output_path, "test")
    for i, author in enumerate(authors):
        # bag = Counter()
        author_path = os.path.join(test_path, author)
        files_of_author = list_files(author_path)
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            author_candidates = calculate_probability_of_author(
                tokens=tokens, training_bags=training_bags, doc_counts=doc_counts)
            candidate_index = authors.index(author_candidates[0][0])
            confusion_matrix[i, candidate_index] += 1
    # print(confusion)
    return confusion_matrix
            stmt.append(parse_sub_block(stream, indent))
            block.append(stmt)
        elif stream.ignore('keyword', string='elif'):
            stmt = AST('elif', [])
            stmt.append(parse_expression(stream))
            stmt.append(parse_sub_block(stream, indent))
            block.append(stmt)
        elif stream.ignore('keyword', string='else'):
            stmt = AST('else', [])
            stmt.append(parse_sub_block(stream, indent))
            block.append(stmt)
        else:
            expr = parse_expression(stream)
            if has_sub_block(stream, indent):
                expr = AST('call', [expr])
                expr.extend(parse_sub_block(stream, indent))
            block.append(expr)
    return block


if __name__ == '__main__':
    path = 'input'
    stream = LookaheadStream(tokenize_file(path, symbols, keywords), Lexeme('eof'))
    root = parse_block(stream, 0)
    print(root.repr())
    if not stream.can_advance('eof'):
        raise Exception("parsing halts")
    # for lexeme in
    #     print lexeme.repr()
    if token_idx >= len(tokens):
        return None, None, [
            "Expected block end token, not end of document",
            f"Failed parsing block for {block_command_operator_token}",
        ]
    if not tokens[token_idx].is_block_end_token():
        return None, None, [
            f"Expected block end token, not: {tokens[token_idx]}",
            f"Failed parsing block for {block_command_operator_token}",
        ]
    token_idx += 1
    block_command_node = ast.BlockCommandNode(
        block_command_operator_token, children
    )
    return block_command_node, token_idx, None


if __name__ == "__main__":
    import sys
    import tokenizer

    tokens = tokenizer.tokenize_file(sys.argv[1])
    root_commands, error = consume_document(tokens)
    print(root_commands)
    print(error)

    import pdb
    pdb.set_trace()
def get_vector(filename, type="mixed"):
    text = tknzr.tokenize_file(filename)
    processed_text = preprocess_text(text)
    vector = build_vector(processed_text, type=type)
    return vector
def get_vector(filename):
    text = tknzr.tokenize_file(filename)
    processed_text = preprocess_text(text)
    vector = build_vector(processed_text)
    return vector
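# A hedged usage sketch (file names are placeholders): build vectors for two
# documents with get_vector() and compare them with ang.angle_between(), mirroring
# the flow of main() at the top of this section.
def compare_files(file_a="sample_input.txt", file_b="sample_input2.txt"):
    va = get_vector(file_a)
    vb = get_vector(file_b)
    return ang.angle_between(va, vb)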
def get_freq(filename, threshold=1000):
    text = tknzr.tokenize_file(filename)
    processed_text = preprocess_text(text)
    res = count_freq(processed_text, fd, threshold)
    return res
def prepare_dataset(data_dir, tmp_dir, dataset_config, tokenize=True, merge_blanks=True):
    """ download, unzip and copy files to data_dir if necessary """
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # def download_dataset():
    #     url = dataset_config["url"]
    #     filename = os.path.basename(url)
    #     read_type = "r:gz" if "tgz" in filename else "r"
    #
    #     compressed_file = maybe_download(tmp_dir, filename, url)
    #     with tarfile.open(compressed_file, read_type) as corpus_tar:
    #         logger.info("extracting %s to %s" % (compressed_file, tmp_dir))
    #         corpus_tar.extractall(tmp_dir)

    # def get_tmp_file(lang_file):
    #     tmp_filepath = os.path.join(tmp_dir, lang_file)
    #     if os.path.isfile(tmp_filepath):
    #         logger.info("Found file: %s" % data_filepath)
    #     else:
    #         # download dataset, if it doesn't exist
    #         download_dataset()
    #     return tmp_filepath

    for _file in ["source", "target"]:
        _tmp = dataset_config[_file]
        _data = dataset_config["data_%s" % _file]

        # skip if data file exists.
        data_filepath = os.path.join(data_dir, _data)
        if os.path.isfile(data_filepath):
            logger.info("Found file: %s" % data_filepath)
            continue

        # get tmp file
        tmp_filepath = os.path.join(tmp_dir, _tmp)
        if not os.path.isfile(tmp_filepath):
            logger.info("tmp file: %s not found, downloading..." % tmp_filepath)
            # download_dataset()

        if tokenize:
            logger.info("tokenizing: %s" % tmp_filepath)
            tokenized = tokenizer.tokenize_file(tmp_filepath)
            logger.info("...done. writing to: %s" % data_filepath)
            f = open(data_filepath, 'w')
            f.write(tokenized)
            f.close()
        else:
            logger.info("tokenize=False, copying to %s" % data_filepath)
            os.rename(tmp_filepath, data_filepath)

    # merge blanks
    if merge_blanks:
        logger.info("\n%s\n%s" % ("=" * 30, "merging blanks..."))
        src = os.path.join(data_dir, dataset_config["data_source"])
        targ = os.path.join(data_dir, dataset_config["data_target"])
        merge_blanks_and_write(src, targ)
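# Illustrative dataset_config for prepare_dataset() (all values are made-up
# placeholders; the keys "source", "target", "data_source", "data_target" are the
# ones the active code above reads, and "url" is only used by the commented-out
# download helper):
example_config = {
    "url": "https://example.com/corpus.tgz",
    "source": "corpus.src",       # raw file name expected inside tmp_dir
    "target": "corpus.tgt",
    "data_source": "train.src",   # tokenized file name written to data_dir
    "data_target": "train.tgt",
}
# prepare_dataset("./data", "./tmp", example_config, tokenize=True, merge_blanks=True)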