Example #1
def process(filename):
    print("Processing " + filename, file=sys.stderr)
    doc = folia.Document(file=filename)

    freqlist = FrequencyList()

    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            if not settings.casesensitive: text = text.lower()
            freqlist.count(text)
    elif settings.sentencemarkers:
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n):
                text = ' '.join(str(w) for w in ngram)
                if not settings.casesensitive: text = text.lower()
                freqlist.count(text)
    else:
        for ngram in Windower(doc.words(), settings.n, None, None):
            text = ' '.join(str(w) for w in ngram)
            if not settings.casesensitive: text = text.lower()
            freqlist.count(text)

    if settings.autooutput:
        if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            outfilename = filename + '.freqlist'
        freqlist.save(outfilename, True)

    return freqlist
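
The unit tests below (Examples #3, #4, #7 and #11) pin down what Windower actually yields. As a quick reference, here is a minimal sketch of that observable behaviour, assuming pynlpl is installed and Windower is imported from pynlpl.textprocessors:

from pynlpl.textprocessors import Windower

tokens = "This is a test .".split()
print(list(Windower(tokens, 2)))
# [('<begin>', 'This'), ('This', 'is'), ('is', 'a'), ('a', 'test'),
#  ('test', '.'), ('.', '<end>')]
print(list(Windower(tokens, 2, None, None)))  #passing None suppresses the padding markers
# [('This', 'is'), ('is', 'a'), ('a', 'test'), ('test', '.')]
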
Example #2
 def append(self, sentence):
     if isinstance(sentence, str):
         sentence = sentence.strip().split(' ')
     self.sentences += 1
     for ngram in Windower(sentence, self.n, self.beginmarker,
                           self.endmarker):
         self.freqlistN.count(ngram)
     for ngram in Windower(sentence, self.n - 1, self.beginmarker,
                           self.endmarker):
         self.freqlistNm1.count(ngram)
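
Keeping both n-gram and (n-1)-gram counts, as this append method does, is exactly what an n-gram language model needs for maximum-likelihood conditional probabilities. A sketch of how the two lists relate (an assumption about how the surrounding class uses them, not code taken from it):

# MLE estimate: P(ngram[-1] | ngram[:-1]) ≈ self.freqlistN[ngram] / self.freqlistNm1[ngram[:-1]]
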
Example #3
 def test_trigrams_word(self):
     """Windower (trigrams) (on single word)"""
     global text
     result = list(iter(Windower(["hi"], 3)))
     self.assertEqual(result, [('<begin>', '<begin>', 'hi'),
                               ('<begin>', 'hi', '<end>'),
                               ('hi', '<end>', '<end>')])
Example #4
 def test_bigrams(self):
     """Windower (bigrams)"""
     global text
     result = list(iter(Windower(text, 2)))
     self.assertEqual(result, [("<begin>", "This"), ("This", "is"),
                               ("is", "a"), ("a", "test"), ("test", "."),
                               (".", "<end>")])
Example #5
 def test_freqlist_caseinsens(self):
     """Bigram Frequency List (case insensitive)"""
     global sentences
     f = FrequencyList(None, False)
     for sentence in sentences:
         f.append(Windower(sentence, 2))
     self.assertTrue((f[('is', 'a')] == 2 and f[('this', 'is')] == 1))
Example #6
 def scoresentence(self, sentence, unknownwordprob=-12):
     """Score a sentence as the product of base-10 n-gram probabilities, applying a fixed log-penalty for unseen n-grams."""
     score = 0
     for ngram in Windower(sentence, self.n, "<s>", "</s>"):
         try:
             score += self.logscore(ngram)
         except KeyError:
             score += unknownwordprob
     return 10**score
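
Summing log scores and exponentiating once at the end avoids the numeric underflow that multiplying many small probabilities directly can cause (compare Example #16, which takes the raw product). A toy check of the identity, assuming logscore returns base-10 log probabilities, as the final 10**score suggests:

import math

probs = [0.1, 0.02, 0.5]
direct = probs[0] * probs[1] * probs[2]
via_logs = 10 ** sum(math.log10(p) for p in probs)
assert abs(direct - via_logs) < 1e-15
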
Example #7
 def test_trigrams(self):
     """Windower (trigrams)"""
     global text
     result = list(iter(Windower(text, 3)))
     self.assertEqual(result, [('<begin>', '<begin>', 'This'),
                               ('<begin>', 'This', 'is'),
                               ('This', 'is', 'a'), ('is', 'a', 'test'),
                               ('a', 'test', '.'), ('test', '.', '<end>'),
                               ('.', '<end>', '<end>')])
Example #8
def process(filename):
    freqlist = None  #returned as None if an error is ignored below
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)

        freqlist = FrequencyList()

        if settings.n == 1:
            for word in doc.words():
                text = word.toktext()
                if not settings.casesensitive: text = text.lower()
                freqlist.count(text)
        elif settings.sentencemarkers:
            for sentence in doc.sentences():
                for ngram in Windower(sentence.words(), settings.n):
                    text = ' '.join(str(w) for w in ngram)
                    if not settings.casesensitive: text = text.lower()
                    freqlist.count(text)
        else:
            for ngram in Windower(doc.words(), settings.n, None, None):
                text = ' '.join(str(w) for w in ngram)
                if not settings.casesensitive: text = text.lower()
                freqlist.count(text)

        if settings.autooutput:
            if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
                outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
            else:
                outfilename = filename + '.freqlist'
            freqlist.save(outfilename, True)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " +
                  filename,
                  e,
                  file=sys.stderr)
        else:
            raise

    return freqlist
Example #9
def main():
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print >> sys.stderr, "No files specified"
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        f = codecs.open(filename, 'r', encoding)
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))

        f.close()

    dist = Distribution(freqlist)
    for key, count in freqlist:
        if isinstance(key, (tuple, list)):
            label = " ".join(key)
        else:
            label = key
        print(label + "\t" + str(count) + "\t" + str(dist[key]) + "\t" + str(dist.information(key)))

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Example #10
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.settings['confusibles']:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        classifier.append(leftcontext + rightcontext, confusible)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
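
To make the instance layout concrete, a worked example with made-up words and settings: with leftcontext=2 and rightcontext=1 (so n=4), a line containing "... easy to use it wrong ..." yields the 4-gram ('easy', 'to', 'use', 'it'). If the focus ngram[2] == 'use' is listed in settings['confusibles'], the appended training instance is:

#   features: ('easy', 'to') + ('it',)  ->  ('easy', 'to', 'it')
#   class:    'use'
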
Example #11
 def test_unigrams(self):
     """Windower (unigrams)"""
     global text
     result = list(iter(Windower(text, 1)))
     self.assertEqual(result, [("This", ), ("is", ), ("a", ), ("test", ),
                               (".", )])
Example #12
    def train(self, sourcefile, modelfile, **parameters):
        if modelfile == self.confusiblefile:
            #Build frequency list
            self.log(
                "Preparing to generate lexicon for suffix confusible module")
            classfile = stripsourceextensions(sourcefile) + ".cls"
            corpusfile = stripsourceextensions(sourcefile) + ".dat"

            if not os.path.exists(classfile):

                self.log("Building class file")
                classencoder = colibricore.ClassEncoder(
                    "", self.settings['minlength'],
                    self.settings['maxlength'])  #character length constraints
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(
                    classfile, self.settings['minlength'],
                    self.settings['maxlength'])

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile(sourcefile, corpusfile)

            self.log("Generating frequency list")
            options = colibricore.PatternModelOptions(
                mintokens=self.settings['freqthreshold'],
                minlength=1,
                maxlength=1)  #unigrams only
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Finding confusible pairs")
            classdecoder = colibricore.ClassDecoder(classfile)
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            for pattern in model:
                try:
                    pattern_s = pattern.tostring(classdecoder)
                except UnicodeDecodeError:
                    self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                    continue  #skip this pattern rather than silently reusing the previous one
                for suffix in self.suffixes:
                    if pattern_s.endswith(suffix) and pattern_s not in self.confusibles:
                        found = []
                        for othersuffix in self.suffixes:
                            if othersuffix != suffix:
                                otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                                try:
                                    otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                                except KeyError:
                                    found = []
                                    break
                                if otherpattern not in model:
                                    found = []
                                    break
                                if self.settings['maxratio'] != 0:
                                    freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                    ratio = max(freqs) / min(freqs)
                                    if ratio < self.settings['maxratio']:
                                        found = []
                                        break
                                found.append(otherpattern_s)
                        if found:
                            self.confusibles.append(pattern_s)
                            for s in found:
                                self.confusibles.append(s)

            self.log("Writing confusible list")
            with open(modelfile, 'w', encoding='utf-8') as f:
                for confusible in self.confusibles:
                    f.write(confusible + "\n")

        elif modelfile == self.modelfile:
            if not hasattr(self, 'confusibles'):
                self.confusibles = []
                self.log("Loading confusiblefile")
                with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            self.confusibles.append(line)

            if self.hapaxer:
                self.log("Training hapaxer...")
                self.hapaxer.train()

            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0:
                        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                    for ngram in Windower(line, n):
                        confusible = ngram[l]
                        if confusible in self.confusibles:
                            if self.hapaxer:
                                ngram = self.hapaxer(ngram)
                            leftcontext = tuple(ngram[:l])
                            rightcontext = tuple(ngram[l + 1:])
                            suffix, normalized = self.getsuffix(confusible)
                            if suffix is not None:
                                classifier.append(leftcontext + (normalized, ) + rightcontext, suffix)

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
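
The instance layout here differs from Example #10: the normalized focus word stays in the feature vector and its suffix becomes the class, so the classifier learns which suffix fits a normalized form in context. A sketch with a hypothetical pair, assuming getsuffix("loopt") returns ("t", "loop"):

#   features: leftcontext + ('loop',) + rightcontext
#   class:    't'
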
Example #13
    def run(self, inputdata):
        """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
        words = [word_text for word_id, word_text in inputdata]  #pylint: disable=unused-variable
        word_ids = [word_id for word_id, word_text in inputdata]  #pylint: disable=unused-variable

        actions = [None] * len(words)  #per-token actions: ('delete', punct, freq), ('insert', punct, freq) or ('recase', word, freq)

        #find possible deletions
        for i, trigram in enumerate(Windower(words, 3)):
            if trigram[0] != "<begin>" and trigram[-1] != "<end>":
                if trigram[1] in self.PUNCTUATION and trigram[
                        0] not in self.PUNCTUATION and trigram[
                            -1] not in self.PUNCTUATION:
                    #trigram pattern (X p Y) focussing on a punctuation token
                    trigram_pattern = self.classencoder.buildpattern(
                        " ".join(trigram))
                    trigram_oc = self.trigram_model.occurrencecount(
                        trigram_pattern)
                    if trigram_oc >= self.settings['deletioncutoff']:
                        if self.debug:
                            self.log(
                                " (Trigram '" + " ".join(trigram) +
                                "' too frequent to consider for deletion (" +
                                str(trigram_oc) + ")")
                    else:
                        #bigram version without the punctuation token
                        if trigram[1] in self.EOSMARKERS and trigram[-1].isalpha(
                        ) and trigram[-1][0] == trigram[-1][0].upper(
                        ):  #deletion candidate is an eos marker, remove casing
                            bigram = (trigram[0], trigram[-1].lower())
                        else:
                            bigram = (trigram[0], trigram[-1])
                        bigram_pattern = self.classencoder.buildpattern(
                            " ".join(bigram))
                        if not bigram_pattern.unknown():
                            #get occurrences
                            bigram_oc = self.bigram_model.occurrencecount(
                                bigram_pattern)
                            if bigram_oc >= self.settings['deletionthreshold']:
                                #bigram (X Y) is prevalent enough to warrant as a deletion solution
                                if self.debug:
                                    self.log(
                                        " (Bigram candidate without punctuation prevalent enough to warrant as a deletion solution: '"
                                        + " ".join(bigram) + "')")

                                #but first check that bigrams X p and p Y don't reach the cut-off threshold
                                bigram_trailpunct = trigram_pattern[0:2]
                                if self.bigram_model.occurrencecount(bigram_trailpunct) >= self.settings['deletioncutoff']:
                                    if self.debug:
                                        self.log(" (Bigram with trailing punctuation exceeds cut-off threshold, no deletion)")
                                else:
                                    bigram_initialpunct = trigram_pattern[1:3]
                                    if self.bigram_model.occurrencecount(bigram_initialpunct) >= self.settings['deletioncutoff']:
                                        if self.debug:
                                            self.log(" (Bigram with initial punctuation exceeds cut-off threshold, no deletion)")
                                    else:
                                        if self.debug:
                                            self.log(" (Punctuation deletion candidate: " + " ".join(bigram) + " (" + str(bigram_oc) + ") vs " + " ".join(trigram) + " (" + str(trigram_oc) + ")")
                                        actions[i - 1] = ('delete', trigram[1], bigram_oc)

            if i > 0 and len(actions) > i - 1 and actions[i - 1] is None:
                #Recasing
                #given a bigram x y       (from trigram x y z)
                #check if x Y is more frequent than x y
                recase = False
                bigram_left = trigram[:-1]
                firstchar = bigram_left[-1][0]
                if firstchar.isalpha():
                    if firstchar == firstchar.lower():
                        firstchar = firstchar.upper()
                    else:
                        firstchar = firstchar.lower()

                    word = bigram_left[1]
                    word_recased = firstchar + bigram_left[1][1:]
                    word_pattern = self.classencoder.buildpattern(word)
                    word_pattern_recased = self.classencoder.buildpattern(
                        word_recased)
                    if not word_pattern_recased.unknown():
                        word_pattern_recased_oc = self.unigram_model.occurrencecount(word_pattern_recased)
                        if word_pattern_recased_oc >= self.settings['recasethreshold']:
                            word_pattern_oc = self.unigram_model.occurrencecount(word_pattern)
                            if word_pattern_recased_oc >= word_pattern_oc * self.settings['recasefactor'] or (
                                    word_pattern_oc == 0 and word_pattern_recased_oc >= self.settings['recasefactor']):
                                #contextless approach
                                recase = True

                            if not recase:
                                #context-based approach
                                if bigram_left[0] == "<begin>":
                                    #first word
                                    if word_pattern_recased_oc >= word_pattern_oc and firstchar == firstchar.upper():
                                        recase = True
                                else:
                                    bigram_left_recased = (bigram_left[0], firstchar + bigram_left[1][1:])
                                    bigram_left_recased_pattern = self.classencoder.buildpattern(" ".join(bigram_left_recased))
                                    if not bigram_left_recased_pattern.unknown():
                                        #if self.debug >= 3: self.log(" (Considering recasing " + bigram_left[1] + " -> " + bigram_left_recased[1] + ")")
                                        bigram_left_recased_oc = self.bigram_model.occurrencecount(bigram_left_recased_pattern)
                                        bigram_left_pattern = self.classencoder.buildpattern(" ".join(bigram_left))
                                        bigram_left_oc = self.bigram_model.occurrencecount(bigram_left_pattern)
                                        if bigram_left_recased_oc >= self.settings['recasethreshold2'] and bigram_left_recased_oc > bigram_left_oc:
                                            if self.debug:
                                                self.log(" (left bigram suggests recasing '" + " ".join(bigram_left) + "' (" + str(bigram_left_oc) + ") -> '" + " ".join(bigram_left_recased) + "' (" + str(bigram_left_recased_oc) + ")")
                                            recase = True

                                            #bigram_right = trigram[1:]
                                            #bigram_right_pattern = self.classencoder.buildpattern(" ".join(bigram_right))
                                            #bigram_right_recased = (firstchar + bigram_right[0][1:], bigram_right[1])
                                            #bigram_right_recased_pattern = self.classencoder.buildpattern(" ".join(bigram_right_recased))
                                            #bigram_right_oc = self.bigram_model.occurrencecount(bigram_right_pattern)
                                            #if not bigram_right_recased_pattern.unknown():
                                            #    bigram_right_recased_oc =  self.bigram_model.occurrencecount(bigram_right_recased_pattern)
                                            #    if bigram_right_oc == 0 or bigram_right_recased_oc > bigram_right_oc:
                                            #        #checks pass, recase:
                                            #        recase = True
                                            #    else:
                                            #        if self.debug: self.log(" (right bigram refutes recasing '" + " ".join(bigram_right) + "' (" + str(bigram_right_oc) + ") -> '" + " ".join(bigram_right_recased) +  "' (" + str(bigram_right_recased_oc) + ")")
                                            #elif bigram_right_oc == 0:
                                            #    recase = True
                                            #else:
                                            #    if self.debug: self.log(" (right bigram refutes recasing '" + " ".join(bigram_right) + "' (" + str(bigram_right_oc) + ") -> '" + " ".join(bigram_right_recased) +  "' (not found)")

                            if recase:
                                if self.debug:
                                    self.log(" (Recasing: '" + word + "' -> '" + word_recased + "' in " + " ".join(trigram) + ")")
                                actions[i - 1] = ('recase', word_recased, 1)

        #find possible insertions
        for i, bigram in enumerate(Windower(words, 2, None, None)):
            if bigram[0] not in self.PUNCTUATION and bigram[1] not in self.PUNCTUATION:
                bigram_pattern = self.classencoder.buildpattern(" ".join(bigram))
                bigram_oc = self.bigram_model.occurrencecount(bigram_pattern)
                if bigram_oc >= self.settings['insertioncutoff']:
                    continue  #bigram too prevalent to consider for insertion

                for punct in self.PUNCTUATION:
                    if punct in self.EOSMARKERS and bigram[-1].isalpha() and bigram[-1][0] == bigram[-1][0].lower():
                        #insertion candidate is an eos marker, do recasing to initial capital
                        trigram = (bigram[0], punct, bigram[-1][0].upper() + bigram[-1][1:])
                    else:
                        trigram = (bigram[0], punct, bigram[-1])
                    trigram_pattern = self.classencoder.buildpattern(" ".join(trigram))
                    if trigram_pattern.unknown():
                        continue

                    trigram_oc = self.trigram_model.occurrencecount(trigram_pattern)
                    if trigram_oc >= bigram_oc and trigram_oc >= self.settings['insertionthreshold']:
                        if self.debug:
                            self.log(" (Punctuation insertion candidate: " + " ".join(trigram) + " (" + str(trigram_oc) + ") vs " + " ".join(bigram) + " (" + str(bigram_oc) + ")")
                        actions[i] = ('insert', punct, trigram_oc)

        #Consolidate all the actions through a simple survival of the fittest mechanism
        #making sure no adjacent deletions/insertion occur
        recaseactions = [None] * len(words)
        for i, (prevaction, action) in enumerate(Windower(actions, 2)):
            i = i - 1
            if action is not None and action[0] != 'recase':
                if prevaction is not None and prevaction != "<begin>" and prevaction[0] != 'recase':
                    if self.debug:
                        self.log("(Consolidating punc/recase actions, removing conflict)")
                    if action[2] > prevaction[2]:  #highest frequency wins
                        actions[i - 1] = None
                    else:
                        actions[i] = None

        #Add recasing actions after insertion/deletion of EOS markers
        for i, action in enumerate(actions):
            if action is not None:
                if action[1] in self.EOSMARKERS:  #Do we have an action on an EOS marker?
                    if action[0] == 'insert':  #Is it an insertion?
                        if len(words) > i + 1 and words[i + 1].isalpha() and words[i + 1] == words[i + 1].lower():
                            #the next word is all lowercase: give it an initial capital
                            if self.debug:
                                self.log(" (Recasing after EOS insertion)")
                            recaseactions[i + 1] = words[i + 1][0].upper() + words[i + 1][1:]
                    elif action[0] == 'delete':  #Is it a deletion?
                        if len(words) > i + 1 and words[i + 1].isalpha() and words[i + 1][0] == words[i + 1][0].upper():
                            #the next word starts with a capital: lowercase it
                            if self.debug:
                                self.log(" (Recasing after EOS deletion)")
                            recaseactions[i + 1] = words[i + 1].lower()

        for i, recaseaction in enumerate(recaseactions):
            if recaseaction is not None:
                actions[i] = ('recase', recaseaction, 1)

        if self.settings['enforcefinalperiod']:
            #enforce final period
            if words[-1] not in self.EOSMARKERS and actions[-1] is None:
                if self.debug: self.log(" (Enforcing final period)")
                actions[-1] = ('insert', '.', 1)

        #                    action, punc
        return [(word_id, (action[0], action[1]))
                for word_id, action in zip(word_ids, actions)
                if action is not None]
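
The returned list pairs each surviving action with its token id, dropping the frequency that was only needed during consolidation. A sketch of its shape (the ids and actions here are hypothetical):

# [('untitleddoc.p.1.s.1.w.3', ('insert', '.')),
#  ('untitleddoc.p.1.s.2.w.1', ('recase', 'The'))]
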
Example #14
    def train(self, sourcefile, modelfile, **parameters):
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        if modelfile.endswith('.ibase'):
            l = self.settings['leftcontext']
            r = self.settings['rightcontext']
            n = l + 1 + r

            self.log("Generating training instances...")
            fileprefix = modelfile.replace(".ibase","") #has been verified earlier
            classifier = TimblClassifier(fileprefix, self.gettimbloptions())
            if sourcefile.endswith(".bz2"):
                iomodule = bz2
            elif sourcefile.endswith(".gz"):
                iomodule = gzip
            else:
                iomodule = io
            with iomodule.open(sourcefile,mode='rt',encoding='utf-8') as f:
                for i, line in enumerate(f):
                    if i % 100000 == 0: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i),file=sys.stderr)
                    for ngram in Windower(line, n):
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        focus = ngram[l]
                        if self.hapaxer and focus == self.hapaxer.placeholder:
                            continue
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l+1:])
                        classifier.append( leftcontext + rightcontext , focus )

            self.log("Training classifier...")
            classifier.train()

            self.log("Saving model " + modelfile)
            classifier.save()
        elif modelfile.endswith('.patternmodel'):
            self.log("Preparing to generate lexicon for Language Model")
            classfile = stripsourceextensions(sourcefile) +  ".cls"
            corpusfile = stripsourceextensions(sourcefile) +  ".dat"

            if not os.path.exists(classfile):
                self.log("Building class file")
                classencoder = colibricore.ClassEncoder()
                classencoder.build(sourcefile)
                classencoder.save(classfile)
            else:
                classencoder = colibricore.ClassEncoder(classfile)

            if not os.path.exists(modelfile+'.cls'):
                #make symlink to class file, using model name instead of source name
                os.symlink(classfile, modelfile + '.cls')

            if not os.path.exists(corpusfile):
                self.log("Encoding corpus")
                classencoder.encodefile( sourcefile, corpusfile)

            self.log("Generating pattern model")
            options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'],minlength=1,maxlength=1)
            model = colibricore.UnindexedPatternModel()
            model.train(corpusfile, options)

            self.log("Saving model " + modelfile)
            model.write(modelfile)
Example #15
def main():
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except (IndexError, ValueError):
        print(
            "Specify which test(s) to run: a single number or a range like 2-5, optionally prefixed with 'x' to skip the pretests",
            file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except IndexError:
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)

    try:
        tmpdir = sys.argv[3]
    except IndexError:
        tmpdir = "/tmp/"

    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        print("Running tests ", begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest + 1, 12)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " +
                      textfile + " " + tmpdir)

    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" +
              str(testnum) + " ----------------------")
        if testnum == 1:

            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 2:
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)"
            )

            from nltk.probability import FreqDist
            from nltk.util import ngrams

            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        elif testnum == 4:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            docount = True
                            if n > 1:
                                for subngram in Windower(ngram, n - 1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2: del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 7:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 8:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

            del model

        elif testnum == 9:
            print(
                "Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 10:

            print(
                "Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8,
                                                      doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 11:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel"
            )
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        else:
            print("No such test", file=sys.stderr)
        print()
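
Hypothetical invocations of this benchmark script (the argument layout follows the sys.argv handling above; the script name is an assumption):

# python3 benchmark.py 1-11 corpus.txt /tmp   #run tests 1..11, with pretests
# python3 benchmark.py x3 corpus.txt          #run only test 3, skip the pretests
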
Example #16
 def scoresentence(self, sentence):
     return product([
         self[x] for x in Windower(sentence, self.n, self.beginmarker,
                                   self.endmarker)
     ])
Example #17
    elif o == "-e":
        encoding = a
    else:
        print("ERROR: Unknown option:", o, file=sys.stderr)
        sys.exit(1)

if not files:
    print("No files specified", file=sys.stderr)
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename, 'r', encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line), n))
        else:
            freqlist.append(crude_tokenizer(line))

    f.close()

dist = Distribution(freqlist)
for key, count in freqlist:
    if isinstance(key, (tuple, list)):
        label = " ".join(key)
    else:
        label = key
    print(label + "\t" + str(count) + "\t" + str(dist[key]) + "\t" + str(dist.information(key)))

print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
print("Types:            ", len(freqlist), file=sys.stderr)
Example #18
#Presuming that each token will be on one line, make a mapping from lines to IDs
idmap = [w.id for w in doc.words()]

########## Extract data for modules ##############

if not standalone:
    clam.common.status.write(statusfile, "Extracting data for modules", 3)

f = open(outputdir + 'input.tok.txt', 'w')
for currentword in doc.words():
    f.write(str(currentword).replace('’', '\'') + ' ')
f.close()

f = open(outputdir + 'agreement_checker.test.inst', 'w')
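# Each output line is a 7-token window around the focus word. Note that
# currentword is written twice: once in the middle of the window and once as
# the final column. Assuming the .inst file is a TiMBL instance file (as the
# name suggests), that final column is the class label the classifier learns
# to predict.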
for prevword3, prevword2, prevword, currentword, nextword, nextword2, nextword3 in Windower(doc.words(), 7):
    f.write(
        str(prevword3) + ' ' + str(prevword2) + ' ' + str(prevword) + ' ' +
        str(currentword) + ' ' + str(nextword) + ' ' + str(nextword2) + ' ' +
        str(nextword3) + ' ' + str(currentword) + '\n')
f.close()

###### BEGIN CALL MODULES (USING PARALLEL POOL) ######
# (nothing to edit here)

errout("Calling modules")
if not standalone:
    clam.common.status.write(statusfile, "Calling Modules", 4)


def processor():