Example #1
 def testAdvanceBeforePeepAheadShouldPass(self):
     global tokenizer
     tokenizer=Tokenizer('a b')
     a=tokenizer.advance()
     self.assertEqual(valueof(a),'a')
     b=tokenizer.peepahead()
     self.assertEqual(valueof(b),'b')
Example #2
def train(training_items=[],
          WS="",
          gazetteer=None,
          tokenize=False,
          tokenizer_context=1,
          left_context=1,
          right_context=1,
          alignment_context=1,
          max_align_dist=1,
          nearest_lev_hyperballs=1,
          max_lev_dist=1,
          min_tok_freq=25,
          min_lem_freq=25):
    """Train and save a new model"""
    # ----- train tokenizer -----
    if tokenize:
        tokenizer = Tokenizer(context=tokenizer_context,
                              WS=WS)
        tokenizer.train(training_items=training_items,
                        gazetteer=gazetteer)
    # ----- train POS-tagger and lemmatizer -----
    sequential_tagger = MaxentTagger(WS=WS,
                                     left_context=left_context,
                                     right_context=right_context,
                                     min_tok_freq=min_tok_freq,
                                     min_lem_freq=min_lem_freq)
    sequential_tagger.train(training_items=training_items,
                            gazetteer=gazetteer)
    sequential_tagger.train_lemmatizer(training_items=training_items,
                                       alignment_context=alignment_context,
                                       max_align_dist=max_align_dist,
                                       nearest_lev_hyperballs=nearest_lev_hyperballs,
                                       max_lev_dist=max_lev_dist)
    return
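The train() entry point above wires the tokenizer and the MaxEnt tagger/lemmatizer together. Below is a hedged usage sketch; the workspace path and the shape of training_items (token/POS/lemma tuples separated by "<utt>" utterance markers, mirroring what tag() in Example #13 consumes) are assumptions for illustration, not taken from the original project.

# Hedged usage sketch: training_items is assumed to be a flat list of
# (token, POS, lemma) tuples separated by "<utt>" utterance markers, and
# WS is assumed to be a writable workspace directory.
training_items = [
    ("in", "PREP", "in"),
    ("principio", "NOUN", "principium"),
    "<utt>",
    ("erat", "VERB", "esse"),
    ("verbum", "NOUN", "verbum"),
    "<utt>",
]
train(training_items=training_items,
      WS="workspace/",      # hypothetical workspace path
      tokenize=False,       # the items above are already tokenized
      min_tok_freq=1,       # tiny toy corpus, so keep every token
      min_lem_freq=1)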
Example #3
	def printPerWordPerplexity(self,file):
		t = Tokenizer()
		print "---------------Perplexity Per Word-----------------------"

		for line in file:
			line = t.tokenize(line)
			perplexity = self.calculatePerWordPerplexity(line)
			print "%s:\t%f" % (line,perplexity)
Example #4
	def printPerWordPerplexityInterpolated(self,file,lambda_uni,lambda_bi):
		t = Tokenizer()
		print "---------------Perplexity Per Word-----------------------"

		for line in file:
			line = t.tokenize(line)
			perplexity = self.calculatePerWordPerplexityInterpolated(line,lambda_uni,lambda_bi)
			print "%s:\t%f" % (line,perplexity)
Example #5
 def testadvanceMultipleTime(self):
     global tokenizer
     tokenizer=Tokenizer('a b c ;')
     a=tokenizer.advance()
     self.assertEqual(valueof(a),'a')
     b=tokenizer.advance()
     self.assertEqual(valueof(b),'b')
     c=tokenizer.advance()
     self.assertEqual(valueof(c),'c')
Example #6
 def testadvanceShouldReturnEndWhenNoMoreToken(self):
     global tokenizer
     tokenizer=Tokenizer('a')
     a=tokenizer.advance()
     self.assertEqual(valueof(a),'a')
     end=tokenizer.advance()
     self.assertEqual(valueof(end),'(end)')
     end=tokenizer.advance()
     self.assertEqual(valueof(end),'(end)')
Example #7
 def testAdvanceBeforePeepAheadTwiceShouldPass(self):
     global tokenizer
     tokenizer=Tokenizer('a')
     a=tokenizer.advance()
     self.assertEqual(valueof(a),'a')
     end=tokenizer.advance()
     self.assertEqual(valueof(end),'(end)')
     end=tokenizer.peepahead()
     self.assertEqual(valueof(end),'(end)')
Example #8
    def __init__(self, depth):
        tokenizer = Tokenizer()
        tokenizer.on_word = lambda x: self.__on_word(x)
        self.tokenizer = tokenizer

        root = Node(Shared.make_key([""]))
        self.nodeByWords = {root.key: root}
        self.depth = depth
        self.history = [root]
        self.wordCount = 0
Example #9
	def printPerWordPerplexityInterpolated(self,file,lu,lb,lt):
		t = Tokenizer()
		print "---------------Perplexity Per Word-----------------------"

		pReturn = []
		for line in file:
			line = t.tokenize(line)
			perplexity = self.calculatePerWordPerplexityInterpolated(line,lu,lb,lt)
			print "%s:\t%f" % (line,perplexity)
			pReturn.append((line,perplexity))
Example #10
def preprocessing(keywords):
	'''Preprocessing the input keywords'''
	global start
	# print "use-preprocessing: ",time.clock()-start
	
	tokens = tok.normalizer(keywords.lower())
	tokens = tok.stop_words(tokens)
	tokens = tok.stemmer(tokens)
	tokens = tok.lemmatizer(tokens)

	return tokens
Example #11
def phraseQuery(phrase, trie):
    # First we will tokenize the phrase using our tokenizer we used to make the index
    tokenizer = Tokenizer()
    phrase = tokenizer.stemQuery(phrase)
    result = []

    # Now get the occurrence dictionary for each word and append it to result
    for word in phrase:
        occurrences = trie.getOccurrences(word)
        # If there are no occurrences for this word, skip it
        if (occurrences != []):
            result.append(occurrences)

    # If result is shorter than the phrase, at least one word never occurs,
    # so there can be no phrase match and we return an empty list
    if (len(phrase) != len(result)):
        return []

    # Now for each key we must see if it's in the next one, and if it is, make sure the positions line up
    # If we wanted to optimize this further we would make it choose the word with the smallest amount of occurrences
    # But that complicates things and goes above what is required
    result2 = set()
    requiredMatches = len(result) - 1

    # Same thing, check for no occurrences
    if result == []:
        return set()

    firstTerm = result[0]

    # print(result)
    for docID in firstTerm:
        # For each docID we must compare each position with position+1 in the other dictionaries
        matches = 0
        positions = firstTerm[docID]
        # print(result, positions)
        # If 0 occurrences
        if positions == 0:
            continue
        for position in positions:
            position2 = position + 1
            for i in range(1, requiredMatches + 1):
                # Make sure word appears in the same document and position+1 exists
                if (docID in result[i] and position2 in result[i][docID]):
                    matches += 1
                position2 += 1

                # print(docID, position, position2, positions, matches)
        if (matches == requiredMatches):
            result2.add(docID)

    return result2
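phraseQuery() above only depends on trie.getOccurrences(word) returning a mapping of docID to sorted token positions (or [] when the word is absent) and on Tokenizer.stemQuery() returning the stemmed phrase terms. The stand-in below illustrates those data shapes; it is an assumption for illustration, not the project's real index classes.

# Minimal stand-in illustrating the shapes phraseQuery expects (assumptions,
# not the project's real index classes).
class FakeTrie:
    def __init__(self, postings):
        self.postings = postings                 # {word: {docID: [positions]}}

    def getOccurrences(self, word):
        return self.postings.get(word, [])       # [] signals "no occurrences"

postings = {
    "new":  {1: [0, 7], 2: [3]},
    "york": {1: [1, 8], 2: [9]},
}
trie = FakeTrie(postings)
# With these postings, "new york" occurs in doc 1 (positions 0-1 and 7-8)
# but never consecutively in doc 2, so the positional intersection above
# should yield {1}.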
Example #12
 def testAdvanceTokenShouldReturnThreeWordWithOneFinalEnd(self):
     global tokenizer
     tokenizer=Tokenizer('a b c ')
     gen=tokenizer.advanceToken()
     a=next(gen)
     self.assertEqual(a,'a')
     b=next(gen)
     self.assertEqual(b,'b')
     c=next(gen)
     self.assertEqual(c,'c')
     d=next(gen)
     self.assertIsNone(d)
Example #13
def tag(test_items=[],
        WS="",
        tokenize=False,
        tokenizer_context=0,
        left_context=1,
        right_context=1,
        gazetteer=None,
        min_tok_freq=25,
        min_lem_freq=25,
        mode=""):
    """
    Tags a list of (potentially annotated) test tokens.
    """
    tokenized_tokens = []
    token_acc, token_f1 = None, None
    if tokenize:
        # load and apply a tokenizer:
        tokenizer = Tokenizer(context=tokenizer_context, WS=WS)
        if mode == "tag":
            tokenized_tokens = tokenizer.tokenize(test_items=test_items, gazetteer=gazetteer)    
        elif mode in ("test", "crossval"):
            items = []
            for item in test_items:
                if item == "<utt>":
                    items.append(item)
                else:
                    items.append(item[0].lower())
            token_acc, token_f1 = tokenizer.eval_tokenizer(test_items=items, gazetteer=gazetteer)
            # return the original tokens since we only tokenize for evaluation purposes:
            tokenized_tokens = items
    else:
        # assume the input has been properly tokenized already:
        if mode == "tag":
          tokenized_tokens = test_items
        elif mode == "test":
          tokenized_tokens = tuple(item[0].lower() for item in test_items)
    sequential_tagger = MaxentTagger(WS=WS, 
                                     left_context=left_context,
                                     right_context=right_context,
                                     min_tok_freq=min_tok_freq,
                                     min_lem_freq=min_lem_freq)
    sequential_tagger.load_models()
    tagged_items = sequential_tagger.tag(tokenized=tokenized_tokens, gazetteer=gazetteer)
    if mode in ("crossval", "test"):
        results = sequential_tagger.evaluate_tags_and_lemmas(gold_items=test_items,\
                                                             silver_items=tagged_items)
        if tokenize:
            results.extend((token_acc, token_f1))
        return results
    else:
        return tagged_items
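A hedged usage sketch for tag() in plain tagging mode; it assumes models were already saved under the workspace by train() (Example #2) and that pre-tokenized input is passed as bare token strings with "<utt>" markers, as the code above suggests.

# Hedged usage sketch, assuming trained models already exist under WS.
tokens = ["in", "principio", "erat", "verbum", "<utt>"]
tagged = tag(test_items=tokens,
             WS="workspace/",   # same hypothetical workspace as in Example #2
             tokenize=False,    # input is already tokenized
             mode="tag")
for item in tagged:
    print(item)                 # presumably one (token, POS, lemma) per item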
Example #14
def determine_usage(resultObj, categorizerObj):

    def deep_find(val, iterable):

        if (not isinstance(iterable, list)) and (not isinstance(iterable, tuple)):
            if val == iterable:
                return True
        else:
            for element in iterable:
                if deep_find(val, element):
                    return True
        return False


    assignments = categorizerObj.tokenCategories.assignments
    other = categorizerObj.tokenCategories.other
    usage = list()
    out = list()

    if resultObj.alias:
        searchPattern = resultObj.alias
    else:
        searchPattern = resultObj.name.split(".")[-1]

    for assignment in assignments:
        aux = list()
        tokValues = [val[1] for val in assignment]
        index = find_all_indices(searchPattern, tokValues)
        if index:
            aux.append(tokenizer.untokenize(assignment))
            aux.append(assignment[0][2][0])
            usage.append(aux)

    for element in other:
        aux = list()
        tokValues = [val[1] for val in element]
        index = find_all_indices(searchPattern, tokValues)
        if index:
            aux.append(tokenizer.untokenize(element))
            aux.append(element[0][2][0])
            usage.append(aux)

    # out = list()
    for case in usage:
        obj = Usage()
        obj.name = resultObj.name
        obj.usage = case[0]
        obj.lineNumber = case[1]
        obj.parent = categorizerObj.file
        out.append(obj)
    return out
Example #15
def TF(documents, outputFormat='sparseMatrix', progressBar=True):
    '''
    Parameters:
        documents: string or list
            a string containing the corpus folder name, or a list of document file paths
        outputFormat: string
            'sparseMatrix' or 'pandas_DataFrame'
    returns:
        <class dict>
        { <class str> documentName: <class dict>
                                    { <class Token> : <class int> frequency }}
    '''
    if type(documents) == str:
        if not path.isdir(documents):
            raise ValueError("you must give me a folder name or list of files")
        documents = [
            path.join(documents, i) for i in listdir(documents)
            if path.isfile(path.join(documents, i))
        ]
        if len(documents) == 0:
            raise ValueError("this folder is empty or has no file!!!")
    if type(documents) == list:
        if not all(path.isfile(i) for i in documents):
            raise ValueError("no file is this directory!!!")
    else:
        raise ValueError("you must give me a folder name or list of files")

    if outputFormat not in ['sparseMatrix', 'pandas_DataFrame']:
        raise ValueError(
            "The outputFormat must be 'sparseMatrix' or 'pandas_DataFrame'")

    if not progressBar:
        tqdm = lambda x: x
    else:
        from tqdm import tqdm

    TF_mat = {}
    for fileName in tqdm(documents):
        if not fileName in TF_mat:
            fileObj = open(fileName)
            TF_mat.update({
                fileName:
                tok.wordCounter(tok.wordTokenizer(fileObj.read())[0])
            })
            fileObj.close()
    if outputFormat == 'sparseMatrix':
        return TF_mat
    elif outputFormat == "pandas_DataFrame":
        return pd.DataFrame(TF_mat,
                            columns=sorted(documents)).fillna(0).sort_index()
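A hedged usage sketch for TF(); the corpus folder name is hypothetical, and the tok and pandas helpers imported elsewhere in the source module are assumed to be available.

# Hedged usage sketch; "corpus/" is a hypothetical folder of plain-text files.
tf_counts = TF("corpus/", outputFormat='sparseMatrix', progressBar=False)
for file_name, counts in tf_counts.items():
    print(file_name, len(counts), "distinct tokens")

# The same call with outputFormat='pandas_DataFrame' returns a terms-by-
# documents table with missing terms filled in as 0.
tf_table = TF("corpus/", outputFormat='pandas_DataFrame', progressBar=False)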
Example #16
 def __init__(self, inp):
     self.toker = Tokenizer(inp)
     self.saved = []  # pushed-back tokens
     self.first = True
     self.currentFile = ""
     self.currentSymTable = {}
     self.localCounter = 0
     self.className = ""
     self.classTable = {}
     self.ifCounter = 0
     self.elseCounter = 0
     self.whileCounter = 0
     self.fieldCounter = 0
     self.staticCounter = 0
Example #17
def Driver():
    #Get Acorn script from input
    try:
        argument = sys.argv[1]
    except IndexError:
        sys.exit(CYAN + "Acorn: " + RED + "Expected acorn script." + WHITE)

    #test to see if input type is correct
    if (not argument.lower().endswith('.acorn')):
        sys.exit("Acorn: Expected acorn file.")
    dataFile = open(argument, "r")
    #raw is a whole string which is the whole Acorn script
    raw = dataFile.read()
    dataFile.close()
    acorn = Lexer.LexerClass()
    acorn.lexer(raw)

    # Send tokens to the tokenizer and get an abstract syntax tree back: ast is a 2-D array
    acornStackFrame = acorn.stackFrame
    Mem = Memory.Memory()
    #s = time.time()
    for i in range(0, len(acornStackFrame)):
        subStack = acornStackFrame[i]
        #print(subStack)
        ast = Tokenizer.Tokenizer(subStack, Mem)
        #print(Mem.heap)
        astp = ast.grammar()
        #print(astp)
        #print(astp.expr1())
        #print(astp)
        Parser.step(astp, Mem)
Example #18
def main(argv):

    # Open file and handle any errors
    f = open(argv[0], "r")

    # Init file used to track our globals
    t.tokenizer = Tokenizer.Tokenizer(f)

    # Init top-level object
    program = Prog.Prog()

    # Form parse tree, which recursively calls parse() on each nonterminal
    program.parse()

    # Recursively print parse tree out
    print("\nprint() output: ")
    program.print()

    # Recursively execute parse tree
    print("\nexec() output: ")
    program.exec()

    # Close file
    f.close()

    # Exit
    exit(0)
Example #19
class TweetsHandler():
    tokenizer = Tokenizer()

    def tweetsToWords(self, filename):
        tweetTokens = []
        with open(filename, 'r') as f:
            for line in f:
                tweet = json.loads(line)

                if 'text' not in tweet.keys():
                    continue

                #lineTokens = self.tokenizer.tokenize(tweet['text'])
                lineTokens = nltk.word_tokenize(tweet['text'])
                print lineTokens

                tagged = nltk.pos_tag(lineTokens)
                tokenJJ = [
                    term for term in tagged
                    if (term[1] == 'JJ' or term[1] == 'JJR' or term[1] == 'JJS'
                        )
                ]  # only adjective

                tweetTokens += tokenJJ

        return tweetTokens

    def countAssociation(self, tweetTokens):
        cnt = Counter()
        cnt.update(tweetTokens)
        return cnt.most_common(20)
Example #20
    def gather(self, tokens):
        """
		Gathers words within program for execution
		"""
        defined_words = {}
        current_word = []
        found_word = 0
        import_called = 0
        for token in tokens:
            if import_called:
                import_called = 0
                token = "..\\" + token + ".pyfth"
                #print token
                imported_program = open(token)
                defined_words.update(
                    self.gather(Tokenizer.tokenize(imported_program)))
                #print defined_words
                imported_program.close()
            elif found_word:
                if token == ";":
                    found_word = 0
                    current_word.append("return")
                    defined_words[current_word_name] = current_word
                    current_word = []

                else:
                    current_word.append(token)
            elif token == "import":
                import_called = 1

            else:
                #print "found word: ",token
                found_word = 1
                current_word_name = token
        return defined_words
Example #21
def analyzeT(jackFile):
    tokenizedXmlFilename = os.path.splitext(jackFile)[0] + "T.xml.cmp"
    outputFile = open(tokenizedXmlFilename, 'w')
    outputFile.write("<tokens>\r\n")
    t = Tokenizer.Tokenizer(jackFile)
    t.advance()
    while t.hasMoreTokens():
        tokenType = t.tokenType()
        if tokenType == Tokenizer.TokenType.KEYWORD:
            outputFile.write("<keyword> " + t.keyword() + " </keyword>")
        elif tokenType == Tokenizer.TokenType.SYMBOL:
            outputFile.write("<symbol> " + charXMLify(t.symbol()) +
                             " </symbol>")
        elif tokenType == Tokenizer.TokenType.IDENTIFIER:
            outputFile.write("<identifier> " + t.identifier() +
                             " </identifier>")
        elif tokenType == Tokenizer.TokenType.INT_CONST:
            outputFile.write("<integerConstant> " + t.intVal() +
                             " </integerConstant>")
        elif tokenType == Tokenizer.TokenType.STRING_CONST:
            outputFile.write("<stringConstant> " + t.stringVal() +
                             " </stringConstant>")
        else:
            pdb.set_trace()
            print("Invalid")
        outputFile.write("\r\n")
        t.advance()
    outputFile.write("</tokens>")
    outputFile.write("\r\n")
    outputFile.close()
Example #22
 def analyze(self):
     for jack_file in self.jackFiles:
         tokenizer = T.Tokenizer(jack_file)
         xml_file = jack_file.replace('.jack', '.xml')
         comp_engine = CE.Parsing(tokenizer, xml_file)
         comp_engine.outFile.close()
         tokenizer.close()
Example #23
def main():
    program = sys.argv[1]
    #tk = Tokenizer.Tokenizer(program)#Create program to be tokenized, based on the rules of the languages
    files = [
        "validAllOneLine.txt", "validAllSimpleExpressions.txt",
        "validBooleanComplex.txt", "validComplexExpressions.txt",
        "validMinimalWhitespace.txt", "validTypicalIfElse.txt",
        "validTypicalLoop.txt"
    ]
    #for i in range(len(files) - 1):
    #program = "validTypicalIfElse.txt"
    tk = Tokenizer.Tokenizer(program)
    while (tk.lcurrentToken() != 'EOF'):  #Search through all the tokens
        tk.lprocessTokens()  #Print out code
        tk.lnextToken()
    tk.lprocessTokens()
    tk.lcloseFile()
    tk.tokens.append(tok.Token('end', 5))

    parser = nodes.ProgramNode()
    parser.parseProgram(tk)
    #print("PARSE COMPLETE!!")
    #print("==================")
    #print(nodes.symTab)
    #print(" ")
    parser.printProgram()
    parser.execProgram()
Example #24
def create_spacy_hu():
    nlp = spacy.blank('hu')
    nlp.tokenizer = Tokenizer.HuTokenizer(nlp.vocab)

    morph_analyzer = LemmatizerMorphAnalyzer.HuLemmaMorph(nlp)
    nlp.add_pipe(morph_analyzer)

    constitutency_parser = ConstitutencyParser.ConstitutencyParser(nlp)
    nlp.add_pipe(constitutency_parser)

    dependency_parser = DependencyParser.DependencyParser(nlp)
    nlp.add_pipe(dependency_parser)

    np_chunker = NPChunker.NPChunker(nlp)
    nlp.add_pipe(np_chunker)

    POS_analyzer = POSTagger.HuPOSTagger(nlp)
    nlp.add_pipe(POS_analyzer)

    preverb_identifier = PreverbIdentifier.PreverbIdentifier(nlp)
    nlp.add_pipe(preverb_identifier)

    hu_word_to_vec = HuWordToVec.HUWordToVec()
    nlp.add_pipe(hu_word_to_vec)

    return nlp
Example #25
def scan_file(file, stdLib):

    # make sure to use pathlib.Path objects, otherwise throw an error
    try:
        file = pathlib.Path(file)
    except TypeError:
        raise TypeError(
                "input <{error_cause}> for 'file' does not match {type_name}".format(
                        type_name = pathlib.Path,
                        error_cause = str(file)
                    )
            )

    if file in stdLib:
        return False

    tok = tokenizer.TokenCategorizer(file)
    modules, callables = imported_modules(tok)
    lokalClasses, lokalFunctions = lokal_callables(tok)

    out = {
            "modules": modules,
            "callables": callables,
            "lokalClasses": lokalClasses,
            "lokalFunctions": lokalFunctions
        }

    return out
Example #26
def compile_file(jack_file_name, xml_file_name):
    # print("Starting compilation.\nSource file: "+jack_file_name+"\nDestination file: "+xml_file_name+"\n")
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer.Tokenizer(jack_file)
    xml_file = open(xml_file_name, 'w')
    compilation_engine = CompilationEngine(tokenizer, xml_file)
    compilation_engine.compile_class()
Example #27
 def run(code):
     Parser.tokens = Tokenizer.Tokenizer(code)
     Parser.tokens.select_next()
     r = Parser.parse_program()
     if Parser.tokens.actual.value == 'EOF':
         return r
     else:
         raise Exception(f'Expected EOF instead got {Parser.tokens.actual.value}')
Example #28
def compile_file(jack_file_name, vm_file_name):
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer.Tokenizer(jack_file)
    symbol_table = SymbolTable.SymbolTable()
    vm_file = open(vm_file_name, 'w')
    vm_writer = VMWriter.VMWriter(vm_file)
    compilation_engine = CompilationEngine(tokenizer, vm_writer, symbol_table)
    compilation_engine.compile_class()
Example #29
 def labelQueryTerm(self, tweetsList):
     tokenizer = Tokenizer.Tokenizer()
     for tweet in tweetsList:
         termsInTweets = tokenizer.tokenize(tweet[2], 'simple')
         for term in termsInTweets:
             if term in list(self.queryDict.keys()):
                 self.queryDict[term].append(tweet[1])
     self._scoreQueryTerms()
Example #30
 def run(code):
     Parser.tokens = Tokenizer(code)
     result = Parser.program()
     if Parser.tokens.actual.type == 'EOF':
         return result
     else:
         raise SyntaxError(
             "Invalid Chain Exception (tip: do not put spaces between numbers)"
         )
Example #31
def Number():
    token = Tokenizer.PeakToken()
    negate = token.IsSymbol(['-'])
    if token.IsSymbol(['+','-']):
        # ignore unary +
        # use negate for unary -
        Tokenizer.Consume()
        token = Tokenizer.PeakToken()

    if token.IsNumber():

        Tokenizer.Consume()
        text = ('-' if negate else '') + token.text
        val = float(text) if '.' in text else int(text)
        return ConstantFunction(val)

    else:
        raise ValueError("Invalid Number")
Example #32
    def index(self):
        for doi, title, abstract in self.col:
            if self.tokenizerType == '0':  # simple
                tokenizer = Tokenizer.SimpleTokenizer(title, abstract)
            else:  # better
                tokenizer = Tokenizer.BetterTokenizer(title, abstract)

            terms = tokenizer.getTerms()
            for term in terms:
                if term in self.term_map.keys():
                    if doi in self.term_map[term].keys():
                        self.term_map[term][doi] += 1
                    else:
                        self.term_map[term][doi] = 1
                else:
                    term_freq_map = {}  # key: docId, value: term_freq
                    term_freq_map[doi] = 1
                    self.term_map[term] = term_freq_map
Example #33
 def run(code):
     Parser.tokens = tkr.Tokenizer(code)
     Parser.tokens.selectNext()
     node = Parser.parseBlock()
     current = Parser.tokens.actual
     if current.type == "EOF":
         return node
     else:
         raise Exception("Tokenizer nao chegou no EOF")
Example #34
    def __init__(self, input, output):
        """

        :param input: input file name
        :param output: output file name whhere the text will be written
        """
        self.tokenizer = Tokenizer.Tokenizer(input)
        self.parsedrule = []
        self.output = open(output, "w")
        self.indent = ""
Example #35
def TermTail(leftParseTree):
    token = Tokenizer.PeakToken()
    if token.IsSymbol(['+','-']):

        Tokenizer.Consume()

        parseTree = BinaryFunction(token.text)
        parseTree.lchild(leftParseTree)
        parseTree.rchild(Term())

        return TermTail(parseTree)

    elif token.IsEOF() or \
            token.IsSymbol(')'):

        return leftParseTree

    else:
        raise ValueError("Term Tail")
Example #36
def getBinTestText(fileName):
    fileText = FileIO.readFile(fileName)
    tokens = Tokenizer.tokenizer(fileText)
    wordOccurence = {}
    resultTokens = ""
    for token in tokens.split("\n"):
        if token not in wordOccurence:
            wordOccurence[token] = 1
            resultTokens = resultTokens + token + "\n"
    return resultTokens
Example #37
def error_printer(error_suggestion, sentence_list):
    # For each flagged word, print (in Chinese) the sentence and word position
    # of the spelling error, followed by a numbered list of suggested corrections.
    for item in error_suggestion:
        args = Tokenizer.mapping(item[0], sentence_list)
        print '第', args[0] + 1, '句句子中第', args[1] + 1, '个词', item[1], '出现拼写错误;'
        print '可能正确的词为:',
        num = 1
        for word in item[2]:
            print num, ')', word,
            num += 1
        print '\n'
Example #38
def Term():
    token = Tokenizer.PeakToken()
    if (token.IsNumber() or \
        token.IsFunc() or \
        token.IsVariable() or \
        token.IsSymbol(['+', '-', '('])):

        parseTree = Factor()
        return FactorTail(parseTree)
    else:
        raise ValueError("Term")
Example #39
def analyze(jackFile):
    outputFilename = os.path.splitext(jackFile)[0] + ".xml.cmp"
    t = Tokenizer.Tokenizer(jackFile)
    ce = CompilationEngine.CompilationEngine(t, outputFilename)

    t.advance()
    if t.keyword() != "class":
        print("jack file does not have a class!")
        exit(1)

    ce.CompileClass()
Example #40
def Expression():
    token = Tokenizer.PeakToken()
    if (token.IsNumber() or \
        token.IsFunc() or \
        token.IsVariable() or \
        token.IsSymbol(['+', '-', '('])):

        parseTree = Term()
        return TermTail(parseTree)
    else:
        raise ValueError("Expression")
Example #41
    def __init__(self, input, output):
        """

        :param input: input file name
        :param output: output file name whhere the text will be written
        """
        self.tokenizer = Tokenizer.Tokenizer(input)
        self.writer = VMWriter.VMWriter(output)
        self.symbolTable = SymbolTable.SymbolTable()
        self.classname = ""
        self.name = ""
Example #42
 def get_queries(self):
     queries, queries_1 = self.read_queries()
     tokenize = Tokenizer.Tokenize(" ")
     self.modified_queries = tokenize.process_data(queries)
     self.modified_queries_1 = queries_1
     f = open("queries for lucene.txt", 'w')
     i = 0
     for q in self.modified_queries:
         q = q.strip("\n")
         q = q.replace("\n", ' ')
         f.write(str(q))
         f.write("\n")
Example #43
def idf():
  IDF = {}
  numDocs = 0
  # Get all the products from the database
  dat = data.getDatabase()
  for table in dat:
    #print '.'
    # Go through each product in each table
    for product in dat[table]:
      item = product[3]
      # Get their reviews
      revs = GetReviews.readReview(item)["Reviews"]
      try:
        for r in revs:
          #print r
          # Tokenize and Stem reviews
          con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
          pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
          comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
          #print 'Before:',r['Cons'],'\n\nAfter:',con
          # Count unique tokens in the document
          for token in list(set(con) | set(pro) | set(comment)):
            if token in IDF: IDF[token] = IDF[token] + 1
            else: IDF[token] = 1
          # Increment the number of documents (once per review, not per token)
          numDocs = numDocs + 1
      except: pass
  # Calculate and return the idf score
  for term in IDF:
    IDF[term] = math.log(float(numDocs)/float(IDF[term]))
  pickle.dump(dict(IDF),open('../data/idf.pickle','wb'))  # Pickling saves SOOO much time
  return IDF
Example #44
def tf_idf():
  TF_IDF = {}
  # Load the inverse document frequencies
  #IDF = idf()
  IDF = dict(pickle.load(open('../data/idf.pickle','rb')))
  dat = data.getDatabase()  # get all of the products
  for table in dat:
    print '.'  # progress marker
    for product in dat[table]:  # For each product in each table
      item = product[3]  # Item number is [3] in the tuple
      revs = GetReviews.readReview(item)["Reviews"]  # we want to read the actual reviews
      product_review = []
      try:
        for r in revs:  # for each review
          tf = {}
          # Tokenize and stem the entire review
          con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
          pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
          comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
          # combine pros, cons, and comments sections
          for token in list(con+pro+comment):  # calculate the term frequencies
            if token in tf: tf[token] = tf[token] + 1
            else: tf[token] = 1
          for t in tf:
            tf[t] = float(1+math.log(tf[t]))*IDF[t]  # calculate tf-idf score
          product_review.append(tf)  # add to list of reviews
      except: pass
      TF_IDF[item] = product_review  # add list of reviews to the dictionary
  return TF_IDF
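The weighting used by idf() and tf_idf() above is the standard log-scaled term frequency multiplied by inverse document frequency. The following self-contained illustration of that formula is independent of the project's Tokenizer, GetReviews and data helpers.

import math

# Self-contained illustration of the weighting used above:
#   idf(t)    = ln(N / df(t))             over N documents
#   tf-idf(t) = (1 + ln(tf(t))) * idf(t)  for each term in one review
documents = [
    ["battery", "life", "battery"],
    ["battery", "screen"],
    ["screen", "screen", "price"],
]
N = len(documents)
df = {}
for doc in documents:
    for term in set(doc):
        df[term] = df.get(term, 0) + 1
idf = {term: math.log(N / float(count)) for term, count in df.items()}

review = documents[0]
tf = {}
for term in review:
    tf[term] = tf.get(term, 0) + 1
tf_idf = {term: (1 + math.log(count)) * idf[term] for term, count in tf.items()}
print(tf_idf)   # "battery" gets a modest weight, "life" a higher idf boost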
Example #45
 def process(self, line, is_inside_until=False):
     statement = Tokenizer(line.strip(' ').strip('\n'), ' ')
     while statement.has_next():
         token = statement.next()
         if token in ['.', '.s', 'CR']:
             if token == '.':
                 sys.stdout.write(self.stack.pop())
                 self.usedWrite = True
             elif token == 'CR':
                 print
             else:
                 print self.stack
         elif token in self.memory:
             self.process(self.memory[token])
         elif token in ['DUP', '+', '-', '*', '/', 'SWAP', 'DROP', '<', '>', '<=', '>=', '=', 'EMIT', 'MOD', 'KEY',
                        'DEPTH', 'ROLL', 'PICK']:
             import Operators
             Operators.Op[token](self.stack)
         elif token == ':':
             self.function_definition(statement)
         elif token == 'IF':
             self.handle_if(statement)
         elif token == 'DO':
             self.do_loop(statement)
         elif token == 'BEGIN':
             self.until_loop(statement)
         elif token == 'LEAVE':
             if is_inside_until and self.stack.pop() != '0':
                 return False
         elif token == 'EXPECT':
             self.expect()
         elif token.startswith('."'):
             self.print_string(token, statement)
         elif token.isdigit():
             self.stack.push(token)
         elif token.strip() == '':
             pass
         else:
             raise NameError('Invalid Input: ' + token)
     return True
Example #46
	def _testParsing(self, st, *L):
		'''Tests that parsing a particular string all at once will return the given
		sequence of tokens. The Wait token at the end is implicit.'''
		# TODO: Test parsing it character-by-character too!
		#print 'Testing '+repr(st)
		L = self._collapseText(list(L)) + [WaitToken()]
		p = Tokenizer()
		p.queueData(st)
		resultToks = []
		while 1:
			tok = p.getNextToken()
			resultToks.append(tok)
			if isinstance(tok, WaitToken):
				break
		resultToks = self._collapseText(resultToks)
		self.assertEqual(len(L), len(resultToks), "Error parsing '%s': Expected token list %s, got %s" % \
							(repr(st), repr(L), repr(resultToks)))
		for tok,result in zip(L,resultToks):
			self.assertEqual(tok, result, \
								"Error parsing '%s': Expected %s but got %s" % \
							 	(repr(st), repr(L), repr(resultToks)))
		self.assertEqual(p.getNextToken(), WaitToken(),
							'When waiting, repeated calls to getNextToken() should '+\
							'continue to return WaitForMoreDataTokens')
Example #47
 def testPeepAheadOnce(self):
     global tokenizer
     tokenizer=Tokenizer('a b')
     a=tokenizer.peepahead()
     self.assertEqual(valueof(a),'a')
     a=tokenizer.peepahead()
     self.assertEqual(valueof(a),'a')
     a=tokenizer.advance()
     self.assertEqual(valueof(a),'a')
     b=tokenizer.peepahead()
     self.assertEqual(valueof(b),'b')
     b=tokenizer.peepahead()
     self.assertEqual(valueof(b),'b')
     b=tokenizer.advance()
     self.assertEqual(valueof(b),'b')
Example #48
def buildCorpus(dirPath, corpusName):
    print('creating corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)
    dictionary = {}
    FileIO.wrtieToFile("corpus\classCount.txt", 'a', (tagClass + '\t' + str(classCnt) + '\n'))
    for dir_entry in os.listdir(dirPath):
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        for token in text.split('\n'):
            if token not in dictionary:
                dictionary[token] = {}
            if tagClass not in dictionary[token]:
                dictionary[token][tagClass] = 0
            dictionary[token][tagClass] = dictionary[token][tagClass] + 1
    for key, value in dictionary.items():
        FileIO.wrtieToFile(corpusName, 'a', (key + '\t' + str(value[tagClass]) + '\t' + tagClass + '\n'))
    print('Corpus creation : Done..')
Example #49
def parse(str):
    array=[]
    CKeyword.defineTable={}
    CKeyword.stringTable={}
    tokenizer=Tokenizer(str)
    token=tokenizer.peepahead()
    CExpression.configure_tokenizer_Expression(tokenizer)
    CKeyword.configure_tokenizer_Keyword(tokenizer)
    while(token.first == ';'):
        tokenizer.advance()
        token=tokenizer.peepahead()
    while(token.first != '(end)'):
        if hasattr(token,'std'):
            temp=CKeyword.parseStatement()
        else:
            temp=CExpression.expression(0)
            tokenizer.advance(';')
        token=tokenizer.peepahead()
        array.append(temp)
    return array
Example #50
def BuildTrainingSet():
  # initialize variables
  database = data.getDatabase()
  IDF = dict(pickle.load(open('../data/idf.pickle','rb')))
  numReviews = 0
  posReview = {}
  numPos = 0
  negReview = {}
  numNeg = 0
  # For each product in each subcategory
  for table in database:
    for product in database[table]:
      item = product[3]
      revs = GetReviews.readReview(item)["Reviews"]
      # get the review
      try:
        for r in revs:
          if (numReviews % 37) == 0:  # Analyze every 37th review
            tf = {}
            # Get reviews for you to read
            con = r['Cons']
            pro = r['Pros']
            comment = r['Comments']
            # Read the reviews
            print pro, ' :: ', con, ' :: ', comment
            # set up to add to training set
            con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
            pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
            comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
            # Treat all parts as one review
            for token in list(con+pro+comment):
              if token in tf: tf[token] = tf[token] + 1
              else: tf[token] = 1
            for t in tf:
              # tf-idf formula
              tf[t] = float(1+math.log(tf[t]))*IDF[t]
            # hopefully you have had time to read, now decide
            Q = int(raw_input('\n1 for good.... 0 for bad.....\n').rstrip('\n'))
            if Q == 1:  # Good
              posReview[numPos] = tf  # add to training set
              numPos = numPos + 1
            elif Q == 0:  # Bad
              negReview[numNeg] = tf  # add to training set
              numNeg = numNeg + 1
            else: print 'FAIL!!!!!!'

          numReviews = numReviews + 1  # increase number of reviews
      except: pass
  saveSet(posReview, negReview)  # Save the training sets
  return (numPos, numNeg)
Example #51
def buildBinarizedCorpus(dirPath, corpusName):
    print('creating binarized corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)
    FileIO.wrtieToFile("corpus\classCount.txt", 'a', (tagClass + '\t' + str(classCnt) + '\n'))
    corpusDict = {}
    for dir_entry in os.listdir(dirPath):
        fileTokens = {}
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        for token in text.split('\n'):
            if token not in fileTokens:
                fileTokens[token] = 1
                if token not in corpusDict:
                    corpusDict[token] = {}
                    corpusDict[token][tagClass] = 1
                else:
                    corpusDict[token][tagClass] = corpusDict[token][tagClass] + 1
    for key, value in corpusDict.items():
        FileIO.wrtieToFile(corpusName, 'a', (key + '\t' + str(value[tagClass]) + '\t' + tagClass + '\n'))
    print('binarized corpus creation done!!')
Example #52
 def testAdvanceShouldReturnEndWhenGivenNone (self):
     global tokenizer
     tokenizer=Tokenizer(None)
     end=tokenizer.advance()
     self.assertEqual(valueof(end),'(end)')
Example #53
remove_999 = True
translate = True
noise_variance = 0.
n_classes='multiclass'
datapath = 'Datasets/kaggle_higgs/'

# Import the data:
print(" ")
# Load the dataset:
print("Loading dataset...")
train_s, train_s_2, valid_s, test_s = tokenizer.extract_data(
                                                        split = True,
                                                        normalize = normalize,
                                                        remove_999 = remove_999,
                                                        translate = translate,
                                                        noise_variance = 0.,
                                                        n_classes = n_classes,
                                                        datapath = datapath,
                                                        train_size = 180000,
                                                        train_size2 = 35000,
                                                        valid_size = 35000)
train_set_x = []
train_set_x_2 = []
valid_set_x = []
test_set_x = []

for i in range(len(train_s[1])):
    train_set_x.append(theano.shared(np.asarray(train_s[1][i],
                                                dtype=theano.config.floatX),
                                     borrow= True))
    train_set_x_2.append(theano.shared(np.asarray(train_s_2[1][i],
Example #54

path = ''
if len(sys.argv)==2:
	path = sys.argv[1]
else:
	print "No path input"
	sys.exit()

bc = BookCleaner(path)

reload(sys)
sys.setdefaultencoding('utf8')

#AGGREGATE RAW TEXT
text_file = open("big_training_raw.txt", "w+")
print "BookCleaner is now aggregating all texts"
str = bc.getAllFilesCleaned()
text_file.write(str)
text_file.close()
#TOKENIZE
print "Tokenizing texts"

text_file = open("tokenized_train.txt","w+")
file = open("big_training_raw.txt")
tk = Tokenizer()
for line in file:
		line = unicode(line, errors='replace')
		str=tk.tokenizeAdvanced(line)
		text_file.write(str)
text_file.close()
Example #55
def main():
    a = "這邊介紹一個叫做get的method,只適用於dictionary"
    tk.tokenize(a)
Example #56
import Modded_PyfthTools as PyfthTools
import Tokenizer
import sys

Pyfth_instance = PyfthTools.Pyfth()
pyfth_file_name = sys.argv[1]

if pyfth_file_name[-6:] != ".pyfth":
	pyfth_file_name += ".pyfth"
pyfth_file = open(pyfth_file_name)

pyfth_file_tokens = Tokenizer.tokenize(pyfth_file)
Pyfth_instance.input_tokens(pyfth_file_tokens)
Pyfth_instance.run()
print "----------------------------- DATASET ----------------------------"
data = 'higgs' # 'mnist.pkl.gz'
print "Dataset studied :                {0}".format(data)
normalize= True
remove_999= True
translate = True
noise_variance = 0.
n_classes = 'binary'

# Import the data:
print(" ")
# Load the dataset:
print("Loading dataset...")
train_s, valid_s, test_s = tokenizer.load_higgs(split= True,
                                           normalize= normalize,
                                           remove_999= remove_999,
                                           noise_variance = noise_variance,
                                           n_classes= n_classes,
                                           translate= True)
train_set_x = train_s[1]
train_set_y = train_s[2]

valid_set_x = valid_s[1]
valid_set_y = valid_s[2]
test_set_x = test_s[1]


# Only one DNN is requiered:
print "--------------------------- PARAMETERS ---------------------------"
print "---- LEARNING: "
epochs              = 2
print "    Number of epochs :               {0}".format(epochs)
Example #58
	def getPerWordPerplexity(self,line):
		t = Tokenizer()
		line = t.tokenize(line)
		perplexity = self.calculatePerWordPerplexity(line)
		return -perplexity
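The perplexity helpers in Examples #3, #4, #9 and #58 all delegate to calculatePerWordPerplexity(), which is not shown. The sketch below assumes a simple unigram model purely to illustrate the usual per-word perplexity formula, perplexity = exp(-(1/N) * sum(log P(w_i))); it is not the project's implementation.

import math

# Hedged sketch of a typical per-word perplexity computation; the unigram
# model here is an assumption used only to illustrate the formula.
def per_word_perplexity(tokens, unigram_probs, smoothing=1e-6):
    log_prob_sum = 0.0
    for token in tokens:
        prob = unigram_probs.get(token, smoothing)   # crude unseen-word floor
        log_prob_sum += math.log(prob)
    return math.exp(-log_prob_sum / len(tokens))

probs = {"the": 0.07, "cat": 0.002, "sat": 0.001}
print(per_word_perplexity(["the", "cat", "sat"], probs))  # roughly 193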
Example #59
import sChecker.spell_checker as spell
import readline
import Tokenizer as tkr
class color:
    END = '\033[0m'
    COW = '\033[0;33m'
    MIW = '\033[0;91m'

while True:
    try:
        _input = input("You >> ")
        c_list = []

        #_word = input("Enter the word >> ")
        _list = tkr.tokenize(_input)

        for _word in _list:
            #word = spell.words(_word)
            c_word = spell.correct(_word)
            c_list.append(c_word)
            
        for i in range(len(c_list)):
            print('\t'+color.MIW+_list[i]+color.END+' -> '+color.COW+c_list[i]+color.END)

        print('')


        #if (_word == c_word):
            #print("Correct Word : "+color.COW+c_word+color.END)
        #else:
Example #60
import time
import Tokenizer
from math import log

i = input('Enter within quotes, m for movie reviews corpus,'
          'r for reuters corpus( default is reuters) : ')

corpus=''

if i=='m' or i=='M':
    corpus='mr'
else:
    corpus='reuters'

start_time = time.time()

list_fileids =  Tokenizer.get_list_fileids(corpus)

#val = my_dict.get(key, mydefaultval)
##1)Create a dictionary with word as key and list of documents where it occurs in sorted order as value

word_doc_dict={}

##2)Loop through the dataset, to get the entire text from  each file

for (file_index,file_name) in enumerate(list_fileids):
    list_words = Tokenizer.get_list_tokens_nltk(corpus,file_name)

##3) Parse the string to get individual words

    #!!!!!!!!------Possible Improvement: Stemming--------------#