Python Tokenizer.ComplexTokenizer 예제들

프로그래밍 언어: Python

클래스/타입: Tokenizer

메소드/함수: ComplexTokenizer

hotexamples.com에서의 예제들: 3

Python Tokenizer.ComplexTokenizer - 3개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한, fastai 패키지의 Python Tokenizer.ComplexTokenizer에 대한 평가가 가장 높은 실제 사용 예제들입니다. 예제를 평가하면 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Tokenizer(30)

PeakToken(7)

Consume(5)

SimpleTokenizer(5)

peepahead(4)

BetterTokenizer(3)

ComplexTokenizer(3)

Tokenize(3)

HuTokenizer(2)

getNextToken(2)

queueData(2)

PasTokenizer(1)

LemmaTokenizer(1)

TokenCategorizer(1)

JanomeTokenizer(1)

Initialize(1)

advanceToken(1)

eval_tokenizer(1)

has_next(1)

lemmatizer(1)

stemmer(1)

tokenizeAdvanced(1)

예제 #1

파일 보기

파일: CreateIndex.py 프로젝트: joao-alegria/RI

def main(argv):
    """
    Main script for the discipline's assignments 1 and 2. This script is responsible for calling the correct classes and for creating the data flow necessary for the index to be created and persisted.

    The command line is parsed with ``getopt``; the selected tokenizer is then
    handed to ``assignment1`` (no ``-r`` given) or ``assignment2`` (``-r`` given).

    :param argv: receives the arguments passed to the program during execution
    :type argv: list<str>
    :returns: 0 after the selected assignment ran, 1 when ``getopt`` rejects the
        options, 2 when the number of positional arguments is not exactly one,
        3 when ``-h`` was given
    :rtype: int
    :raises AssertionError: if the ``-t`` value is neither "simple" nor "complex"
    :raises ValueError: if the value of ``-l``, ``-f`` or ``-r`` is not numeric

    """

    # NOTE(review): the "-f" option is accepted below but is not listed in this
    # help text; its exact meaning is not visible here, so the text is untouched.
    HELP = """USAGE:\n
    python3 CreateIndex.py [-h] [-p] [-w] [-o outputFolder] [-l limit] [-t tokenizer] [-r limitRAM] inputFolder\n
        OPTIONS:
           h - shows this help
           o - define output file's folder
           l - define limit for the number of lines to be processed in each input file
           t - define the tokenizer used for the program
           r - limit program execution to defined RAM capacity
           w - process weights of terms
           p - process positions of terms
        ARGUMENTS:
           outputFolder - actual name for the output folder
           limit - value for the number of lines limit
           tokenizer - must be simple(for the simple 2.1 tokenizer) or complex(for the more advanced 2.2 tokenizer)
           limitRAM - maximum RAM(in Gb) used in the indexing process
           inputFolder - name of the folder that contains the input files to be processed"""

    # default variables
    outputFolder = "index"
    limit = None
    tokenizer = "simple"
    maximumRAM = None
    weightCalc = False
    positionCalc = False
    fileLimit = float("inf")

    try:
        opts, args = getopt.getopt(argv, "wpho:t:l:r:f:")
    except getopt.GetoptError:
        print(HELP)
        return 1

    # exactly one positional argument (the input folder) is required
    if len(args) != 1:
        print(HELP)
        return 2

    # verifies if any option was passed to the script
    for opt, arg in opts:
        if opt == '-h':
            print(HELP)
            return 3
        elif opt == "-o":
            outputFolder = arg
        elif opt == "-l":
            limit = int(arg)
        elif opt == "-f":
            # the given value is scaled by 10^9 before being passed on
            fileLimit = float(arg) * 1000000000
        elif opt == "-t":
            # Raised explicitly instead of through `assert` so the check is
            # still performed under `python -O`; the exception type is kept
            # for backward compatibility.
            if arg not in ("simple", "complex"):
                raise AssertionError(
                    'Tokenizer option must be either "simple" or "complex".')
            tokenizer = arg
        elif opt == "-w":
            weightCalc = True
        elif opt == "-p":
            positionCalc = True
        elif opt == "-r":
            # the requested amount (in Gb) is capped at the currently free memory
            maxM = psutil.virtual_memory().free
            maximumRAM = float(arg) * 1000000000 if arg != "" else maxM
            if maximumRAM > maxM:
                maximumRAM = maxM
                print(
                    "Warning: Memory available is less than the asked value, maximumRAM set to "
                    + str(int(maximumRAM / 1000000000)) + "Gb.")

    # taking into account the chosen tokenizer ("simple" is the default),
    # the respective data flow is created
    if tokenizer == "simple":
        chosenTokenizer = Tokenizer.SimpleTokenizer()
    else:
        chosenTokenizer = Tokenizer.ComplexTokenizer()

    if maximumRAM is None:
        assignment1(chosenTokenizer, outputFolder, args[0], limit, weightCalc,
                    positionCalc, fileLimit)
    else:
        assignment2(chosenTokenizer, outputFolder, args[0], limit, weightCalc,
                    positionCalc, maximumRAM, fileLimit)

    return 0

예제 #2

파일 보기

def main(argv):
    """
    Main script for the discipline's assignment 3. This script is responsible for calling the correct classes and for creating the data flow necessary for querying an existing index.

    The command line is parsed with ``getopt``; the collected settings and the
    selected tokenizer are then handed to ``assignment3``.

    :param argv: receives the arguments passed to the program during execution
    :type argv: list<str>
    :returns: 0 after ``assignment3`` ran, 1 when ``getopt`` rejects the options,
        2 when the number of positional arguments is not 2, 4 or 5,
        3 when ``-h`` was given
    :rtype: int
    :raises AssertionError: if ``-t`` or ``-f`` has an unsupported value, if
        ``-c``, ``-s`` or ``-l`` is not a positive integer, or if the number of
        Rocchio weights does not match the chosen feedback
    :raises ValueError: if a numeric option or weight cannot be converted

    """

    HELP = """USAGE:\n
    python3 QueryIndex.py [-h] [-o outputFile] [-t tokenizer] [-r limitRAM] [-f feedback] [-s rocchioScope] [-c numChamps] [-l limit] <queryFile> <indexFolder> [a b g] \n
        OPTIONS:
           h - shows this help
           o - define output file's name
           t - define the tokenizer used for the program
           r - limit program execution to defined RAM capacity
           f - define the feedback used for the Rocchio algorithm
           s - define the number of retrieved documents considered for the Rocchio algorithm
           c - define the size of the champions list
           l - define the number of scores to return
        ARGUMENTS:
           outputFile - actual name for the output file
           tokenizer - must be 'simple' or 'complex' 
           limitRAM - maximum RAM(in Gb) used in the indexing process
           queryFile - name of the file containing 1 or more queries
           indexFolder - name of the folder that contains the indexes
           a - alpha weight for the Rocchio algorithm
           b - beta weight for the Rocchio algorithm
           g - gamma weight for the Rocchio algorithm
           feedback - must be 'user' or 'pseudo'
           rocchioScope - number of retrieved documents considered for the Rocchio algorithm
           numChamps - size of the champions list
           limit - limit number of scores to return"""

    # default variables
    outputFile = "../queryResults/"
    tokenizer = "complex"
    maximumRAM = None
    feedback = None  # None, pseudo or user
    rocchioWeights = []  # alpha, beta and gamma
    n = None  # number of relevant docs (for feedback)
    k = 10000  # champions list size
    limit = 100  # number of scores

    try:
        opts, args = getopt.getopt(argv, "ho:t:r:f:c:s:l:")
    except getopt.GetoptError:
        print(HELP)
        return 1

    # queryFile and indexFolder, optionally followed by 2 or 3 Rocchio weights
    if len(args) not in (2, 4, 5):
        print(HELP)
        return 2

    # Validation failures are raised explicitly instead of through `assert` so
    # they are still enforced under `python -O`; the exception type is kept for
    # backward compatibility.
    for opt, arg in opts:
        if opt == '-h':
            print(HELP)
            return 3
        elif opt == "-o":
            outputFile = arg
        elif opt == "-t":
            if arg not in ("simple", "complex"):
                raise AssertionError(
                    'Tokenizer option must be either "simple" or "complex".')
            tokenizer = arg
        elif opt == "-r":
            # the requested amount (in Gb) is capped at the currently free memory
            maxM = psutil.virtual_memory().free
            maximumRAM = float(arg) * 1000000000 if arg != "" else maxM
            if maximumRAM > maxM:
                maximumRAM = maxM
                print(
                    "Warning: Memory available is less than the asked value, maximumRAM set to "
                    + str(int(maximumRAM / 1000000000)) + "Gb.")
        elif opt == "-f":
            if arg not in ("user", "pseudo"):
                raise AssertionError(
                    'Feedback option must be either "user" or "pseudo".')
            feedback = arg
        elif opt == "-c":
            value = int(arg)
            if value <= 0:
                raise AssertionError(
                    "Error: numChamps value must be a positive integer")
            k = value
        elif opt == "-s":
            value = int(arg)
            if value <= 0:
                raise AssertionError(
                    "Error: rocchioScope value must be a positive integer")
            n = value
        elif opt == "-l":
            value = int(arg)
            if value <= 0:
                raise AssertionError(
                    "Error: limit value must be a positive integer")
            limit = value

    # the positional arguments after queryFile and indexFolder are the weights
    if feedback == "pseudo":
        # pseudo feedback takes alpha and beta only
        if len(args) != 4:
            raise AssertionError(
                "Error: if you want to use pseudo feedback, please insert alpha and beta as well"
            )
        rocchioWeights = [float(weight) for weight in args[2:4]]
    elif feedback:
        # user feedback takes alpha, beta and gamma
        if len(args) != 5:
            raise AssertionError(
                "Error: if you want to use user feedback, please insert alpha, beta and gamma as well"
            )
        rocchioWeights = [float(weight) for weight in args[2:5]]

    # taking into account the chosen tokenizer ("complex" is the default),
    # the respective data flow is created
    if tokenizer == "simple":
        chosenTokenizer = Tokenizer.SimpleTokenizer()
    else:
        chosenTokenizer = Tokenizer.ComplexTokenizer()

    assignment3(outputFile, chosenTokenizer, maximumRAM, feedback, n, k,
                limit, args[0], args[1], rocchioWeights)

    return 0

예제 #3

파일 보기

파일: rocchio.py 프로젝트: joao-alegria/RI

    :noindex:
.. moduleauthor:: Filipe Pires [85122] & Joao Alegria [85048]
"""

import os
import sys
import psutil
import gc
import getopt

import Tokenizer
import Searcher

# Hard-coded settings used by the rest of this script.
inputFolder = "../index/"
# The directory listing is taken once, when this module-level code runs.
inputFiles = os.listdir(inputFolder)
tokenizer = Tokenizer.ComplexTokenizer()
limit = 20
maximumRAM = None
feedback = "user"  # original note listed None as the alternative value
# The loop below runs once per value and names its output files "<n>.txt".
N = [5, 10, 20]
k = 10000
rocchioWeights = [1.0, 1.0, 0.1]  # original note listed [] as the alternative value

for n in N:
    queriesFile = open("../queries.txt", "r")
    pseudoFeedbackFile = open("../pseudoFeedback/" + str(n) + ".txt", "w")
    userFeedbackFile = open("../userFeedback/" + str(n) + ".txt", "w")

    for line in queriesFile:
        # process each query
        query = line.split("\t")