Example #1
import sys

from tokenise import *   # assumed import: tokenise(), language_specials(), TYPE_* constants


def process_hyphens(soft_hyphen_dict, tokens):
    # Scan for end-of-line hyphenation: WORD "-" "\n" followed by another
    # word, or by a page marker and then the word's continuation.
    for c in range(len(tokens) - 3):
        if (tokens[c][1] == TYPE_WORD and
            tokens[c + 1][0] == "-" and
            tokens[c + 2][0] == "\n" and
            (tokens[c + 3][1] == TYPE_WORD or tokens[c + 3][1] == TYPE_PAGEMARKER)):
            word = tokens[c][0] + tokens[c + 1][0]
            if tokens[c + 3][1] == TYPE_PAGEMARKER:
                if c + 4 >= len(tokens): continue  # guard the look-ahead below
                word += tokens[c + 4][0]
            else:
                word += tokens[c + 3][0]
            # assumes every hyphenated form has an entry in soft_hyphen_dict:
            # a real hyphen keeps its "-" and gains a "%" marker; a soft,
            # line-break-only hyphen is replaced by "%" alone
            if word == soft_hyphen_dict[word]:
                tokens[c + 1][0] += u"%"
            else:
                tokens[c + 1][0] = u"%"
    return tokens


text = unicode(sys.stdin.read(), "utf-8")


# soft-hyphen dictionary: field 0 is the hyphenated form, field 2 its resolution
soft_hyphen_dict = {}
for line in file(sys.argv[1]):
    fields = unicode(line, "utf-8").split()
    soft_hyphen_dict[fields[0]] = fields[2]

tokens = language_specials(tokenise(text, True))
tokens = process_hyphens(soft_hyphen_dict, tokens)
for t in tokens:
    sys.stdout.write(t[0].encode("utf-8"))
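
The loader above keeps fields 0 and 2 of each whitespace-separated line, so a
soft-hyphens file presumably looks like this (hypothetical entries; field 1 is
ignored):

Zeit-schrift 1 Zeitschrift
Nord-west 1 Nord-west

The first entry resolves to the joined form, marking its hyphen as a soft
line-break artefact; the second resolves to itself, so its hyphen is kept as
part of the word.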
Example #2
import os
import sys

import project_data
from tokenise import *   # assumed import: tokenise() and the TYPE_* constants

if len(sys.argv) < 4 or not os.access(sys.argv[1], os.R_OK):
    print "Usage: %s project_file user [old_file]" % sys.argv[0]
    sys.exit(-1)
# get a list of relevant pages
pd = project_data.ProjectData(sys.argv[1])
pages = pd.get_pages(sys.argv[2])
if len(pages) == 0:
    print "No pages for user " + sys.argv[2]
    sys.exit(-2)
# build the pages into a single block of text
text = u""
for p in pages:
    text += unicode(pd.get_text(p[0], sys.argv[2])[project_data.DATA], "utf-8")
    text += u"\n"
# tokenise the text
tokens = tokenise(text)
# make apostrophes and hyphens part of words
for c in range(len(tokens) - 2):
    if (
        tokens[c][1] == TYPE_WORD
        and (tokens[c + 1][0] == "'" or tokens[c + 1][0] == "-")
        and tokens[c + 2][1] == TYPE_WORD
    ):
        tokens[c + 2] = [tokens[c][0] + tokens[c + 1][0] + tokens[c + 2][0], TYPE_WORD]
        tokens[c] = tokens[c + 1] = ["", TYPE_UNKNOWN]
tokens = [X for X in tokens if X[1] != TYPE_UNKNOWN]
# split out eol hyphenated words
eol_hyphenated = set()
for c in range(len(tokens) - 3):
    if (
        tokens[c][1] == TYPE_WORD
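
The listing breaks off inside this condition. Judging by the identical scan in
Example #1, the test presumably continues along these lines (a sketch, not the
original code):

    if (
        tokens[c][1] == TYPE_WORD
        and tokens[c + 1][0] == "-"
        and tokens[c + 2][0] == "\n"
        and tokens[c + 3][1] == TYPE_WORD
    ):
        eol_hyphenated.add(tokens[c][0] + tokens[c + 1][0] + tokens[c + 3][0])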
Example #3
from pyspark.mllib.fpm import FPGrowth
from operator import itemgetter
from tokenise import *

from pyspark import SparkContext
sc = SparkContext('local', 'Exam_3')
data = open("output.txt", "r").read()
data = data.split("\n@@@\n")

# write one whitespace-separated transaction per line
with open("newoutput.txt", "w") as f:
    for i in data:
        for j in set(tokenise(i)):
            if j != '"rt':
                f.write(j + " ")
        f.write("\n")

data = sc.textFile("newoutput.txt")
print "starting"
transactions = data.map(lambda line: line.strip().split(' '))

model = FPGrowth.train(transactions, minSupport=0.001, numPartitions=10)
result = sorted(model.freqItemsets().collect(), key=lambda x: len(x[0]), reverse=True)
for fi in result:
    print(fi)
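
Each element of result is a pyspark.mllib.fpm.FreqItemset with .items and
.freq fields, so the collected output can be narrowed further, e.g. (an
illustrative filter, not part of the original script):

pairs = [fi for fi in result if len(fi.items) >= 2]
pairs.sort(key=lambda fi: fi.freq, reverse=True)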


import operator


with open("small.txt") as f:
	data=f.read()
print "haiyya"

data=data.split("@@@")
tf=[]
idf={}
N=1
tweetsize=[]
for tweet in data:
	print "tweet" + str(N)
	N += 1
	te = tokenise(tweet)
	tweetsize.append(len(te))
	dic = {}
	# term frequency within this tweet
	for i in te:
		try:
			dic[i] += 1
		except KeyError:
			dic[i] = 1
	tf.append(dic)
	# document frequency: how many tweets contain each term
	for key in dic:
		try:
			idf[key] += 1
		except KeyError:
			idf[key] = 1
for i in xrange(len(tf)):
	for j in tf[i]:
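
The listing is truncated inside this loop. A sketch of how the weighting
presumably continues, assuming the standard TF-IDF formula (math.log and the
normalisation are assumptions, not the original code):

import math

for i in xrange(len(tf)):
	for j in tf[i]:
		# term count normalised by tweet length, times log inverse
		# document frequency over all tweets
		tf[i][j] = (float(tf[i][j]) / tweetsize[i]) * math.log(float(len(tf)) / idf[j])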
Example #5
    print "Usage: %s my_file their_file soft_hyphens_file"
    sys.exit(-1)

soft_hyphen_dict = {}
for line in file(sys.argv[3]):
    fields = unicode(line, "utf-8").split()
    soft_hyphen_dict[fields[0]] = fields[2]

sm = difflib.SequenceMatcher()


chapters_1 = split_chapters(unicode(file(sys.argv[1]).read(), "utf-8"))
chapters_2 = split_chapters(unicode(file(sys.argv[2]).read(), "utf-8"))
chapter_str = []
for ch in range(20):  # compare the first 20 chapters
    seq1 = make_sequence(remove_soft_hyphens(soft_hyphen_dict, language_specials(tokenise(chapters_1[ch]))))
    seq2 = make_sequence(join_hyphens(language_specials(tokenise(chapters_2[ch]))))

    sm.set_seqs(seq1, seq2)

    outstr = u"<p>"
    matches = sm.get_matching_blocks()
    for c, m in enumerate(matches):
        if m[2] == 0: break  # m[2] is the match length; the final block is a zero-length sentinel
        next_match = matches[c + 1]
        outstr += "".join(seq1[m[0]:m[0] + m[2]]).replace("\n", "</p><p>")
        diff1 = seq1[m[0] + m[2]: next_match[0]]
        for t in diff1:
            if t == "\n": t = "\\n"
            outstr += "<span class='diff1'>%s</span>" % t
        diff2 = seq2[m[1] + m[2]: next_match[1]]
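
The excerpt ends here. Mirroring the diff1 loop above, the diff2 side would
presumably be rendered the same way (a sketch of the likely continuation):

        for t in diff2:
            if t == "\n": t = "\\n"
            outstr += "<span class='diff2'>%s</span>" % t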