Example #1
def processFile(filename, fp):
    # Read the file contents; skip this file if it cannot be opened,
    # otherwise text would be undefined below.
    try:
        with open(filename, mode='r', encoding='utf8') as ifp:
            text = ifp.read()
    except IOError:
        print("Sorry, cannot read from file", filename)
        return
    # Build a unigram model from the tokenized text and accumulate the counts in fp.
    ngrams.makeNgramModel(tokenize(text), 1, fp)
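
All of these examples import makeNgramModel (and Example #1 also a tokenize helper) from an ngrams module that is not shown on this page, presumably the one from https://github.com/dcavar/Py3L referenced below. The following is only a minimal sketch of what those helpers might look like, reconstructed from how they are called here (a sequence of tokens, the n-gram size, and optionally a frequency dictionary to update in place); it is an assumption, not the original implementation:

import re

def tokenize(text):
    # hypothetical tokenizer: lowercase the text and split on non-word characters
    return [t for t in re.split(r"\W+", text.lower()) if t]

def makeNgramModel(tokens, n, model=None):
    # count n-grams as space-joined keys; update model in place if one is given
    # (space-joined keys match Example #3, which splits each bigram key back into two tokens)
    if model is None:
        model = {}
    for i in range(len(tokens) - n + 1):
        key = " ".join(tokens[i:i + n])
        model[key] = model.get(key, 0) + 1
    return model
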
Example #2
#!/usr/bin/env python3

"""
(C) 2013 by Damir Cavar

Reading remote texts from a URL.
"""


import urllib.request, ngrams
from operator import itemgetter


response = urllib.request.urlopen("http://www.gutenberg.org/cache/epub/873/pg873.txt")
text = response.read().decode('utf-8')

# normalize Windows line endings
text = text.replace("\r\n", "\n")

myitems = ngrams.makeNgramModel(text.split(), 3).items()
# myitems = sorted(myitems, key=lambda element: element[1], reverse=True)
myitems = sorted(myitems, key=itemgetter(1), reverse=False)
for key, value in myitems:
    print(key, value)




Example #3
"""
N-gram model from text and generation of a DOT representation (Graphviz).
"""


from ngrams import makeNgramModel
from operator import itemgetter
import urllib.request, re

req = urllib.request.Request('http://www.gutenberg.org/cache/epub/873/pg873.txt')
response = urllib.request.urlopen(req)
the_page = response.read()
# convert the bytes object to a string
the_page = the_page.decode('utf-8')
# strip punctuation; the hyphen goes last so it is not read as a character range
the_page = re.sub(r"[\"';:,.<>?!()_*+=/\[\]{}-]", " ", the_page)

fp = makeNgramModel(the_page.lower().split(), 2)

tuplelist = fp.items()

print("digraph X {")
for token, frq in sorted(tuplelist, key=itemgetter(1), reverse=True):
    tokens = token.split()
    # skip bigrams where either element is empty
    if len(tokens[0]) == 0 or len(tokens[1]) == 0:
        continue
    # quote the node names so tokens that are not plain DOT identifiers stay valid
    print('"' + tokens[0] + '" -> "' + tokens[1] + '" ;')
print("}")

Example #4
"""
Make sure that you paste some German and English text into
text-de-1.txt and text-en-1.txt for the extraction of the
language models. The more text you put into these files,
the better your model generation should be.
"""

# https://github.com/dcavar/Py3L

from ngrams import makeNgramModel

# read file
ifp = open("text-de-1.txt", mode='r', encoding='utf8')
mytext = ifp.read()
ifp.close()

mymodel_de = makeNgramModel(mytext, 3)

total = sum(mymodel_de.values())

for key, value in mymodel_de.items():
    mymodel_de[key] = value / total
#    print(key, mymodel_de[key])


# read file
ifp = open("text-en-1.txt", mode='r', encoding='utf8')
mytext = ifp.read()
ifp.close()

mymodel_en = makeNgramModel(mytext, 3)
Example #5
"""
Make sure that you paste some German and English text into
text-de-1.txt and text-en-1.txt for the extraction of the
language models. The more text you put into these files,
the better your model generation should be.
"""

# https://github.com/dcavar/Py3L

from ngrams import makeNgramModel

# read file
ifp = open("text-de-1.txt", mode='r', encoding='utf8')
mytext = ifp.read()
ifp.close()

mymodel_de = makeNgramModel(mytext, 3)

total = sum(mymodel_de.values())

for key, value in mymodel_de.items():
    mymodel_de[key] = value / total
#    print(key, mymodel_de[key])

# read file
ifp = open("text-en-1.txt", mode='r', encoding='utf8')
mytext = ifp.read()
ifp.close()

mymodel_en = makeNgramModel(mytext, 3)

# normalize the English model to relative frequencies, as for the German one above
total = sum(mymodel_en.values())

for key, value in mymodel_en.items():
    mymodel_en[key] = value / total
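
Examples #4 and #5 pass the raw text string (not a token list) to makeNgramModel, so the models are presumably over character trigrams, and the loops above turn the counts into relative frequencies. As a follow-up, here is a hedged sketch, not part of the original code, of how the two models might be used for simple language identification; the score() helper and the sample sentence are made up for illustration:

def score(text, model):
    # hypothetical scorer: build the trigram profile of the unknown text with the
    # same makeNgramModel function and sum the relative frequencies the language
    # model assigns to its trigrams, weighted by their counts in the text
    unknown = makeNgramModel(text, 3)
    return sum(model.get(key, 0.0) * count for key, count in unknown.items())

sample = "Dies ist ein kurzer Beispielsatz."
print("de score:", score(sample, mymodel_de))
print("en score:", score(sample, mymodel_en))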
Example #6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
(C) 2013 by Damir Cavar

Reading remote texts from a URL.
"""

import urllib.request, ngrams
from operator import itemgetter

response = urllib.request.urlopen(
    "http://www.gutenberg.org/cache/epub/873/pg873.txt")
text = response.read().decode('utf-8')

# normalize Windows line endings
text = text.replace("\r\n", "\n")

myitems = ngrams.makeNgramModel(text.split(), 3).items()
# myitems = sorted(myitems, key=lambda element: element[1], reverse=True)
myitems = sorted(myitems, key=itemgetter(1), reverse=False)
for key, value in myitems:
    print(key, value)