def main():
    text1 = tknzr.tokenize_file("sample_input.txt")
    processed_text1 = preprocess_text(text1)
    v11 = build_vector(processed_text1)
    v12 = build_vector(processed_text1, type="bigram")
    print(v11)
    print(v12)

    text2 = tknzr.tokenize_file("sample_input2.txt")
    processed_text2 = preprocess_text(text2)
    v21 = build_vector(processed_text2)
    v22 = build_vector(processed_text2, type="bigram")
    print(v21)
    print(v22)

    print(ang.angle_between(v11, v21))
    print(ang.angle_between(v12, v22))
def create_BOW(root_directory='./preprocessed_texts/'):
    """
    :type root_directory: str
    """
    training_path = os.path.join(root_directory, "training")
    training_bag_of_author = {}
    # super_counter = Counter()
    doc_count_of_author = {}
    authors = list_dirs(training_path)
    # total_doc_count = 0
    for author in authors:
        bag = Counter()
        author_path = os.path.join(training_path, author)
        files_of_author = list_files(author_path)
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            bag += Counter(tokens)
        training_bag_of_author[author] = bag
        doc_count = len(files_of_author)
        doc_count_of_author[author] = doc_count
        # total_doc_count += doc_count
        # super_counter += bag
    # print(super_counter.most_common(10))
    return training_bag_of_author, doc_count_of_author
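# A minimal usage sketch (an assumption, not part of the original project): the bags
# and per-author document counts returned by create_BOW() are exactly what
# calculate_confusion_matrix() further below expects, with the directory layout
# ./preprocessed_texts/{training,test}/<author>/ implied by the defaults above.
training_bags, doc_counts = create_BOW('./preprocessed_texts/')
confusion = calculate_confusion_matrix(training_bags, doc_counts, './preprocessed_texts/')
print(confusion)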
def replace_constants(source):
    """
    For each line, if it is necessary, it replaces a constant with a call to a new random function.

    :param source: File path
    :return: A list of lines
    """
    # apply the tokenizer to every line, so lines is a list of strings
    lines = tokenizer.tokenize_file(source)
    # then apply the variable/constant replacement to every line
    lines = replace_constant_var_num(lines)
    # rename the variables used by while loops
    lines = replace_constant_while(lines)
    # rename the variables used by for loops
    lines = replace_constant_for(lines)

    pattern = r'import\s+\w+\s*'
    for index, line in enumerate(lines):
        if re.search(pattern, line) is None:
            break
    for block in new_def:
        lines.insert(index, block)
        index += 1
    return lines
def replace_instructions(source):  # called from the main code
    """
    For each line, if it is necessary, it replaces an instruction with a sequence of instructions.

    :param source: Source file path.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)  # in "tokenizer.py": reads the lines of the source file (in this case output.py)
    for index, line in enumerate(lines):
        line_tokenized = tokenizer.tokenize_line(line)  # line_tokenized is a list of strings
        line_to_replace = line

        # short to long
        pattern = r'\w+[\+\-\*\/]=\w+'  # regex for augmented assignments
        if re.search(pattern, line) is not None:
            # transform a line containing a short-form operation into its long form (e.g. v+=1 --> v=v+1)
            line_to_replace = short_to_long(line_tokenized)
            line_tokenized = tokenizer.tokenize_line(line_to_replace)

        # look for the matching pattern
        pattern = match_pattern(line_to_replace)
        if pattern == 0:  # var = var + var
            # find the operation actually performed, then replace the line at
            # position `index` with a for/while loop that computes the same result;
            # in practice this inflates the program both in line count and in execution time
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_var(operators)
        elif pattern == 1:  # var = var + num
            # find the operation actually performed
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_num(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_num(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_num(operators)
        elif pattern == 2:  # var = num + var
            # find the operation actually performed
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_num_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_num_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_num_var(operators)
    return lines
def replace_instructions(source):
    """
    For each line, if it is necessary, it replaces an instruction with a sequence of instructions.

    :param source: Source file path.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)
    for index, line in enumerate(lines):
        line_tokenized = tokenizer.tokenize_line(line)
        line_to_replace = line

        # short to long
        pattern = r'\w+[\+\-\*\/]=\w+'
        if re.search(pattern, line) is not None:
            line_to_replace = short_to_long(line_tokenized)
            line_tokenized = tokenizer.tokenize_line(line_to_replace)

        # match the correct pattern
        pattern = match_pattern(line_to_replace)
        if pattern == 0:  # var = var + var
            # match the correct operation
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_var(operators)
        elif pattern == 1:  # var = var + num
            # match the correct operation
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_var_num(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_var_num(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_var_num(operators)
        elif pattern == 2:  # var = num + var
            # match the correct operation
            operators = get_operators(line_tokenized)
            if operators['op'] == '+' or operators['op'] == '-':
                lines[index] = generate_sum_sub_var_num_var(operators)
            elif operators['op'] == '*':
                lines[index] = generate_mult_var_num_var(operators)
            elif operators['op'] == '/':
                lines[index] = generate_div_var_num_var(operators)
    return lines
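# The pass above relies on short_to_long() to rewrite augmented assignments before
# pattern matching. A minimal sketch of that rewrite, working on a raw line instead
# of a token list (an illustration under assumptions, not the project's helper):
import re

def short_to_long_sketch(line):
    # "v += 1"  ->  "v = v + 1"
    m = re.match(r'\s*(\w+)\s*([+\-*/])=\s*(.+)', line)
    if m:
        var, op, rhs = m.groups()
        return "{0} = {0} {1} {2}".format(var, op, rhs)
    return line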
def get_info(filename):
    text = tknzr.tokenize_file(filename)
    cnt_sent = len(text)
    cnt_words = 0
    cnt_symbols = 0
    cnt_words_len = 0
    for sentence in text:
        for token in sentence:
            cnt_symbols += len(token)
            if not is_punct(token):
                cnt_words += 1
                cnt_words_len += len(token)
    print("{0}\nsentences: {1}\nwords: {2}\nsymbols: {3}\n"
          "average words in sentence: {4}\naverage symbols in sentence: {5}\n"
          "average word length: {6}\n".format(
              filename, cnt_sent, cnt_words, cnt_symbols,
              cnt_words / cnt_sent, cnt_symbols / cnt_sent, cnt_words_len / cnt_words))
    return (cnt_sent, cnt_words, cnt_symbols,
            (cnt_words / cnt_sent), (cnt_symbols / cnt_sent), (cnt_words_len / cnt_words))
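# get_info() assumes an is_punct() helper that is not shown above. One plausible
# implementation (an assumption based on string.punctuation; the original may differ):
import string

def is_punct(token):
    # treat a token as punctuation if every character in it is a punctuation mark
    return len(token) > 0 and all(ch in string.punctuation for ch in token)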
def obfuscate(source):
    """
    Given the source code, it searches for variable names and replaces them.

    :param source: Source file.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)
    for ind, line in enumerate(lines):
        for pattern in pattern_search.values():
            match = re.search(pattern, line)
            if match:
                search_variable_to_replace(line)
    lines = replace(lines)
    return (lines, replacement_dic)
def obfuscate(source):  # first function called by pythonCowObfuscator()
    """
    Given the source code, it searches for variable names and replaces them.

    :param source: Source file.
    :return: A list of lines.
    """
    # split the source code into lines; this is of course the code produced by the previous step
    lines = tokenizer.tokenize_file(source)
    for ind, line in enumerate(lines):
        for pattern in pattern_search.values():  # iterate over the regex patterns we want to find
            match = re.search(pattern, line)     # match the line against the pattern (i.e. the regex)
            if match:                            # if there is a match
                search_variable_to_replace(line)
    # replace the old variables with the new ones, over the whole file, i.e. every line
    lines = replace(lines)
    return (lines, replacement_dic)
def obfuscate(source, dictionary):
    # source: the file from the previous step; dictionary: the variables to replace
    """
    Given the source code and the variable dictionary, it searches for function names and replaces them.

    :param source: Source file.
    :param dictionary: Variable dictionary.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)  # split the file into lines
    for ind, line in enumerate(lines):       # index the lines and check them all
        pattern_search = r'\s*def\s*\w+\s*\(\w*'   # regex to look for: def statements, i.e. functions
        match = re.search(pattern_search, line)    # apply the regex and look for a def
        if match:                                  # if the current line matches
            search_function_to_replace(line, dictionary)  # record the function name to rename in this line
    lines = replace(lines)  # rewrite the lines using the entries in the dictionary
    return lines
def obfuscate(source, dictionary):
    """
    Given the source code and the variable dictionary, it searches for function names and replaces them.

    :param source: Source file.
    :param dictionary: Variable dictionary.
    :return: A list of lines.
    """
    lines = tokenizer.tokenize_file(source)
    for ind, line in enumerate(lines):
        pattern_search = r'\s*def\s*\w+\s*\(\w*'
        match = re.search(pattern_search, line)
        if match:
            search_function_to_replace(line, dictionary)
    lines = replace(lines)
    return lines
def main():
    file_in = constants.DEFAULT_INPUT_FILE
    file_preprocessed = constants.DEFAULT_PREPROCESSED_FILE
    file_out = constants.DEFAULT_OUTPUT_FILE
    print(file_in)
    time.sleep(0.005)  # if printing to stderr, let stdout output first

    lines_of_tokens = tokenize_file(file_in)
    instructions = token_parser(lines_of_tokens)
    with open(file_preprocessed, "w") as f:
        for inst in instructions:
            assert isinstance(inst, Instruction)
            line = inst.opcode.text.ljust(5) + " "
            line += " ".join([str(_.text) for _ in inst.operands])
            f.write(line + "\n")
    combinator_signals = inst_to_signals(instructions)

    # create blueprint
    bp = Blueprint()
    bp.generate_rom_entities(len(combinator_signals))
    bp.insert_signals(combinator_signals)

    # export
    json_string = json.dumps(bp.json_dict)
    output = bp_encode_base64(bp_compress(json_string))
    with open(file_out, "w") as f:
        f.write(output + "\n")

    # paste to clipboard, clip on Windows
    if platform.system() == "Windows":
        os.system("clip < " + file_out)
        print("Done. Blueprint string on clipboard.")
    else:
        print("Done. Blueprint string saved as " + file_out)
def replace_constants(source):
    """
    For each line, if it is necessary, it replaces a constant with a call to a new random function.

    :param source: File path
    :return: A list of lines
    """
    lines = tokenizer.tokenize_file(source)
    lines = replace_constant_var_num(lines)
    lines = replace_constant_while(lines)
    lines = replace_constant_for(lines)

    pattern = r'import\s+\w+\s*'
    for index, line in enumerate(lines):
        if re.search(pattern, line) is None:
            break
    for block in new_def:
        lines.insert(index, block)
        index += 1
    return lines
def calculate_confusion_matrix(training_bags, doc_counts, output_path='./preprocessed_texts/'):
    authors = list(training_bags.keys())
    confusion_matrix = np.zeros([len(authors), len(authors)], dtype=np.integer)
    test_path = os.path.join(output_path, "test")
    for i, author in enumerate(authors):
        # bag = Counter()
        author_path = os.path.join(test_path, author)
        files_of_author = list_files(author_path)
        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            author_candidates = calculate_probability_of_author(
                tokens=tokens, training_bags=training_bags, doc_counts=doc_counts)
            candidate_index = authors.index(author_candidates[0][0])
            confusion_matrix[i, candidate_index] += 1
    # print(confusion)
    return confusion_matrix
            stmt.append(parse_sub_block(stream, indent))
            block.append(stmt)
        elif stream.ignore('keyword', string='elif'):
            stmt = AST('elif', [])
            stmt.append(parse_expression(stream))
            stmt.append(parse_sub_block(stream, indent))
            block.append(stmt)
        elif stream.ignore('keyword', string='else'):
            stmt = AST('else', [])
            stmt.append(parse_sub_block(stream, indent))
            block.append(stmt)
        else:
            expr = parse_expression(stream)
            if has_sub_block(stream, indent):
                expr = AST('call', [expr])
                expr.extend(parse_sub_block(stream, indent))
            block.append(expr)
    return block


if __name__ == '__main__':
    path = 'input'
    stream = LookaheadStream(tokenize_file(path, symbols, keywords), Lexeme('eof'))
    root = parse_block(stream, 0)
    print(root.repr())
    if not stream.can_advance('eof'):
        raise Exception("parsing halts")
    # for lexeme in
    #     print lexeme.repr()
    if token_idx >= len(tokens):
        return None, None, [
            "Expected block end token, not end of document",
            f"Failed parsing block for {block_command_operator_token}",
        ]
    if not tokens[token_idx].is_block_end_token():
        return None, None, [
            f"Expected block end token, not: {tokens[token_idx]}",
            f"Failed parsing block for {block_command_operator_token}",
        ]
    token_idx += 1
    block_command_node = ast.BlockCommandNode(
        block_command_operator_token, children
    )
    return block_command_node, token_idx, None


if __name__ == "__main__":
    import sys
    import tokenizer

    tokens = tokenizer.tokenize_file(sys.argv[1])
    root_commands, error = consume_document(tokens)
    print(root_commands)
    print(error)

    import pdb
    pdb.set_trace()
def get_vector(filename, type="mixed"):
    text = tknzr.tokenize_file(filename)
    processed_text = preprocess_text(text)
    vector = build_vector(processed_text, type=type)
    return vector
def get_vector(filename):
    text = tknzr.tokenize_file(filename)
    processed_text = preprocess_text(text)
    vector = build_vector(processed_text)
    return vector
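# A hedged usage sketch (file names are placeholders): build vectors for two
# documents with get_vector() and compare them with ang.angle_between(), mirroring
# the flow of main() at the top of this section.
def compare_files(file_a="sample_input.txt", file_b="sample_input2.txt"):
    va = get_vector(file_a)
    vb = get_vector(file_b)
    return ang.angle_between(va, vb)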
def get_freq(filename, threshold=1000):
    text = tknzr.tokenize_file(filename)
    processed_text = preprocess_text(text)
    res = count_freq(processed_text, fd, threshold)
    return res
def prepare_dataset(data_dir, tmp_dir, dataset_config, tokenize=True, merge_blanks=True):
    """ download, unzip and copy files to data_dir if necessary """
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # def download_dataset():
    #     url = dataset_config["url"]
    #     filename = os.path.basename(url)
    #     read_type = "r:gz" if "tgz" in filename else "r"
    #
    #     compressed_file = maybe_download(tmp_dir, filename, url)
    #     with tarfile.open(compressed_file, read_type) as corpus_tar:
    #         logger.info("extracting %s to %s" % (compressed_file, tmp_dir))
    #         corpus_tar.extractall(tmp_dir)

    # def get_tmp_file(lang_file):
    #     tmp_filepath = os.path.join(tmp_dir, lang_file)
    #     if os.path.isfile(tmp_filepath):
    #         logger.info("Found file: %s" % data_filepath)
    #     else:
    #         # download dataset, if it doesn't exist
    #         download_dataset()
    #     return tmp_filepath

    for _file in ["source", "target"]:
        _tmp = dataset_config[_file]
        _data = dataset_config["data_%s" % _file]

        # skip if data file exists.
        data_filepath = os.path.join(data_dir, _data)
        if os.path.isfile(data_filepath):
            logger.info("Found file: %s" % data_filepath)
            continue

        # get tmp file
        tmp_filepath = os.path.join(tmp_dir, _tmp)
        if not os.path.isfile(tmp_filepath):
            logger.info("tmp file: %s not found, downloading..." % tmp_filepath)
            # download_dataset()

        if tokenize:
            logger.info("tokenizing: %s" % tmp_filepath)
            tokenized = tokenizer.tokenize_file(tmp_filepath)
            logger.info("...done. writing to: %s" % data_filepath)
            f = open(data_filepath, 'w')
            f.write(tokenized)
            f.close()
        else:
            logger.info("tokenize=False, copying to %s" % data_filepath)
            os.rename(tmp_filepath, data_filepath)

    # merge blanks
    if merge_blanks:
        logger.info("\n%s\n%s" % ("=" * 30, "merging blanks..."))
        src = os.path.join(data_dir, dataset_config["data_source"])
        targ = os.path.join(data_dir, dataset_config["data_target"])
        merge_blanks_and_write(src, targ)
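# Illustrative dataset_config for prepare_dataset() (all values are made-up
# placeholders; the keys "source", "target", "data_source", "data_target" are the
# ones the active code above reads, and "url" is only used by the commented-out
# download helper):
example_config = {
    "url": "https://example.com/corpus.tgz",
    "source": "corpus.src",       # raw file name expected inside tmp_dir
    "target": "corpus.tgt",
    "data_source": "train.src",   # tokenized file name written to data_dir
    "data_target": "train.tgt",
}
# prepare_dataset("./data", "./tmp", example_config, tokenize=True, merge_blanks=True)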