def processFile(filename, fp):
    """Read *filename* as UTF-8, tokenize it, and build a unigram model.

    The resulting model is handed to ngrams.makeNgramModel together with
    the output stream *fp* (semantics of *fp* defined by the ngrams module).

    On a read error a message is printed and the function returns without
    building a model.
    """
    try:
        # `with` guarantees the handle is closed even if read() raises.
        with open(filename, mode='r', encoding='utf8') as ifp:
            text = ifp.read()
    except IOError:
        print("Sorry, cannot read from file", filename)
        # Bug fix: the original fell through after the except clause and
        # used the undefined name `text`, raising NameError on I/O failure.
        return
    ngrams.makeNgramModel(tokenize(text), 1, fp)
#!/usr/bin/env python3

"""
(C) 2013 by Damir Cavar

Reading remote texts from a URL and printing a trigram frequency model.
"""

import urllib.request
import ngrams
from operator import itemgetter

# Fetch the Gutenberg e-text; read() returns raw bytes.
response = urllib.request.urlopen(
    "http://www.gutenberg.org/cache/epub/873/pg873.txt")
# Bug fix: str(bytes) yields the repr "b'...'" containing literal backslash
# escape sequences; the original then patched those escapes textually.
# Decode the payload instead and normalize the real CRLF line endings.
text = response.read().decode('utf-8')
text = text.replace("\r\n", "\n")

myitems = ngrams.makeNgramModel(text.split(), 3).items()
# Sort trigrams by frequency, ascending (least frequent first).
# myitems = sorted(myitems, key=lambda element: element[1], reverse=True)
myitems = sorted(myitems, key=itemgetter(1), reverse=False)
for key, value in myitems:
    print(key, value)
N-gram model from text and generation of DOT-representation (Graphviz). """ from ngrams import makeNgramModel from operator import itemgetter import urllib.request, re req = urllib.request.Request('http://www.gutenberg.org/cache/epub/873/pg873.txt') response = urllib.request.urlopen(req) the_page = response.read() # convert the bytearray to a string object the_page = the_page.decode('utf-8') the_page = re.sub("[\"\';:,.<>?!()-_*+=\/\[\]\{\}]", " ", the_page) fp = makeNgramModel(the_page.lower().split(), 2) tuplelist = fp.items() print("digraph X {") for token, frq in sorted(tuplelist, key=itemgetter(1), reverse=True): tokens = token.split() if len(tokens[0]) == 0: continue if len(tokens[1]) == 0: continue print(tokens[0] + ' -> ' + tokens[1] + ' ;') print("}")
Make sure that you paste some German and English text for the extraction of language models in text-de-1.txt and text-en-1.txt. The more text you put in these files, the better it should be for your model generation. """ # https://github.com/dcavar/Py3L from ngrams import makeNgramModel # read file ifp = open("text-de-1.txt", mode='r', encoding='utf8') mytext = ifp.read() ifp.close() mymodel_de = makeNgramModel(mytext, 3) total = sum(mymodel_de.values()) for key, value in mymodel_de.items(): mymodel_de[key] = value / total # print(key, mymodel_de[key]) # read file ifp = open("text-en-1.txt", mode='r', encoding='utf8') mytext = ifp.read() ifp.close() mymodel_en = makeNgramModel(mytext, 3)
Make sure that you paste some German and English text for the extraction of language models in text-de-1.txt and text-en-1.txt. The more text you put in these files, the better it should be for your model generation. """ # https://github.com/dcavar/Py3L from ngrams import makeNgramModel # read file ifp = open("text-de-1.txt", mode='r', encoding='utf8') mytext = ifp.read() ifp.close() mymodel_de = makeNgramModel(mytext, 3) total = sum(mymodel_de.values()) for key, value in mymodel_de.items(): mymodel_de[key] = value / total # print(key, mymodel_de[key]) # read file ifp = open("text-en-1.txt", mode='r', encoding='utf8') mytext = ifp.read() ifp.close() mymodel_en = makeNgramModel(mytext, 3) total = sum(mymodel_en.values())
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
(C) 2013 by Damir Cavar

Reading remote texts from a URL and printing a trigram frequency model.
"""

import urllib.request
import ngrams
from operator import itemgetter

# Fetch the Gutenberg e-text; read() returns raw bytes.
response = urllib.request.urlopen(
    "http://www.gutenberg.org/cache/epub/873/pg873.txt")
# Bug fix: str(bytes) yields the repr "b'...'" containing literal backslash
# escape sequences; the original then patched those escapes textually.
# Decode the payload instead and normalize the real CRLF line endings.
text = response.read().decode('utf-8')
text = text.replace("\r\n", "\n")

myitems = ngrams.makeNgramModel(text.split(), 3).items()
# Sort trigrams by frequency, ascending (least frequent first).
# myitems = sorted(myitems, key=lambda element: element[1], reverse=True)
myitems = sorted(myitems, key=itemgetter(1), reverse=False)
for key, value in myitems:
    print(key, value)