Example #1
def main(arg="iamtoocoolforthis"):

    s = clean(arg)
    print "CLEANED STRING:", s
    print "======================RUNNING OPTIMIZED==================="
    print segment_method1(s)
    print "======================RUNNING VANILLA==================="
    print segment(s)
Example #2
def precisioncalc(query):
	print query,
	k = searchgoogle(query)
	seg = segment(query)
	m = []
	for n in seg:
		m.append(stemming.porter2.stem(n))
	seg = " ".join(m)
	if socialListProxy:
		proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
		opener = ulib.build_opener(proxy)
		ulib.install_opener(opener)
	counter = 0
	total = 0
	for i in xrange(len(k)):
		req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
		k[i] = segment(k[i])
		l = []
		for j in k[i]:
			l.append(stemming.porter2.stem(j))
		k[i] = " ".join(l)  # join the stemmed tokens so the "seg in k[i]" check below compares stems with stems
		# print k[i]
		try:
			content = ulib.urlopen(req)
			x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
			t = []
			for s in x:
				t.append(stemming.porter2.stem(s))
			t = " ".join(t)
			# print t
			if ((seg in k[i]) or (seg in t)):
				counter = counter + 1
			total = total + 1
		except:
			pass

		if (total == 10):
			print str(counter)+"/"+str(total),
		if (total == 20):
			print str(counter)+"/"+str(total),


	if total < 10:
		print str(counter)+"/"+str(10), str(counter)+"/"+str(20)
	elif total < 20:
		print str(counter)+"/"+str(20)
	else:
		print ""
#precisioncalc("madhusai") #uncomment this to check the presion of some word
Example #3
def test_segment_10():
    result = [
        'as', 'gregor', 'samsa', 'awoke', 'one', 'morning', 'from', 'uneasy',
        'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed',
        'into', 'a', 'gigantic', 'insect'
    ]
    assert segment(''.join(result)) == result
Example #4
def test_segment_12():
    result = [
        'far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the',
        'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of',
        'the', 'galaxy', 'lies', 'a', 'small', 'un', 'regarded', 'yellow', 'sun'
    ]
    assert segment(''.join(result)) == result
Example #5
def create_dict():
	relation_name=[x[2] for x in os.walk("nell/relations")][0]
	sub_table={}
	obj_table={}
	for r in relation_name:
		lst=[]
		r_name=' '.join(segment(r.split(':')[1]))
		print r_name
		with open("nell/relations/"+r) as fp:
			for line in fp:
				line=line.rstrip('\n')
				sub,obj=line.split('\t')
				sub=' '.join((sub.split(":")[2]).split('_'))
				obj=' '.join((obj.split(":")[2]).split('_'))
				if sub in sub_table:
					tmp=sub_table[sub]
					tmp=tmp.union([r_name])
					sub_table[sub]=tmp
					#print("y")
				else:
					sub_table[sub]=set([r_name])
				if obj in obj_table:
					tmp=obj_table[obj]
					tmp=tmp.union([r_name])
					obj_table[obj]=tmp
					#print("yy")
				else:
					obj_table[obj]=set([r_name])
				#print len(sub_table[sub]),len(obj_table[obj])
	return sub_table,obj_table
Example #6
def info_extract(u):
        final_string = ""
        twe=url.split(u)

        newtweet=""
        for a in range(len(twe)):
            newtweet = newtweet+twe[a]+" "

        text = sep.split(newtweet);
        tex=""    
        for i in range(len(text)):
                if(hasht.match(text[i]) or atp.match(text[i])):
                        m=text[i][1:]
                        text[i]=segment(m.lower())
                        n=""
                        for j in text[i]:
                            n=n+j+" "
                        text[i]=n
                tex+=text[i]+" "

        final_string=final_string+categorize(tex)+"####"
        final_string=final_string+babelnet(tex)+"####"
        twee = url.search(u)
        try:
            urls = str(twee.group(0))
            final_string=final_string+url_categ(urls)+"<br>"
        except:
            pass
        final_string=final_string+twe_cat(tex)+"####"
        final_string=final_string+senti(u)+"####"
        return final_string
Example #7
def test_segment_9():
    result = [
        'it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst',
        'of', 'times', 'it', 'was', 'the', 'age', 'of', 'wisdom', 'it', 'was',
        'the', 'age', 'of', 'foolishness'
    ]
    assert segment(''.join(result)) == result
Example #8
def k_list_repeat(query):
	k = searchgoogle(query)
	m = []

	if socialListProxy:
		proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
		opener = ulib.build_opener(proxy)
		ulib.install_opener(opener)

	for i in xrange(len(k)):
		req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
		k[i] = segment(k[i])
		l = []
		for j in k[i]:
			l.append(stemming.porter2.stem(j))
		k[i] = " ".join(k[i])
		# print k[i]
		try:
			content = ulib.urlopen(req)
			#reading the title of url
			x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
			t = []
			for s in x:
				t.append(stemming.porter2.stem(s))
			t = " ".join(t)
			m.append(t)

		except:
			pass
	return m
Example #9
def segment_hashtag(h):
    """segment the words inside the hashtag h, discard non alphanum chars"""
    if hasattr(h, "group"):
        h = h.group()[1:]
    else:
        h = h[1:]
    # print(h, " hashtag " + wordsegment.segment(h) + " . ")
    return " hashtag " + " ".join(wordsegment.segment(h)) + " , "
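A minimal usage sketch of the function above; the hashtag and the exact split are illustrative, and it assumes a wordsegment release (1.0+) where load() must be called once before segment(), as Example #38 below does:

import wordsegment

wordsegment.load()
# with segment_hashtag defined as above, a plain string has its leading '#'
# stripped and the remainder split into dictionary words:
print(segment_hashtag("#throwbackthursday"))
# likely output: " hashtag throwback thursday , "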
Example #10
 def get_word_vector(self, word):
     if word is None:
         return None
     word = word.strip().strip('[').strip(']').strip('(').strip(')')
     word_lower = word.lower()
     word_upper = word.upper()
     try:
         if word_lower not in self.word_vectors_map:
             if config.debug:
                 print 'getting word vector for ', word
             if word in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word]
             #todo: if vocab is ensured to be lower case, this condition is not required
             elif word_lower in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word_lower]
             elif word_upper in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word_upper]
             else:
                 if not constants.concept_regexp.sub('', word):
                     return self.get_word_vector(constants.alpha_regex.sub('', word))
                 subwords = word.split()
                 if len(subwords) == 1:
                     subwords = word.split(',')
                     if len(subwords) == 1:
                         subwords = word.split('/')
                         if len(subwords) == 1:
                             subwords = word.split(':')
                             if len(subwords) == 1:
                                 subwords = word.split('-')
                                 if len(subwords) == 1:
                                     subwords = word.split('_')
                                     if len(subwords) == 1:
                                         # print 'performing word segmentation on ', word
                                         subwords = ws.segment(word.encode('utf8'))
                                         if len(subwords) == 1:
                                             print 'could not get wordvector for ', word
                                             self.word_vectors_map[word_lower] = None
                 if len(subwords) > 1:
                     curr_wordvec = None
                     for curr_subword in subwords:
                         curr_subword_vec = self.get_word_vector(curr_subword)
                         if curr_subword_vec is not None:
                             if curr_wordvec is None:
                                 curr_wordvec = curr_subword_vec
                             else:
                                 start_time = time.time()
                                 curr_wordvec = ss.fftconvolve(curr_wordvec, curr_subword_vec, mode='same')
                                 if config.debug:
                                     print 'performed fast fourier transform convolution on word vectors in {} seconds.'.format(time.time()-start_time)
                     self.word_vectors_map[word_lower] = curr_wordvec
         return self.word_vectors_map[word_lower]
     except UnicodeDecodeError as ude:
         print 'error getting word vector for ', word
         print ude.message
         self.word_vectors_map[word_lower] = None
         return self.word_vectors_map[word_lower]
Example #11
def read_nell_relations():
	"""
		Read the relations from the NELL graph folders.

		Return the list of relation names.
	"""
	rel=os.walk("nell/relations")
	relation=[]
	for i in rel:
		trel=i[2]
	for i in trel:
		relation.append(' '.join(segment(i.split(':')[1])))
	return relation
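For intuition, a small hedged sketch of the name handling above; the relation name is hypothetical and the exact split comes from wordsegment's dictionaries:

from wordsegment import load, segment

load()
name = "concept:companyceo"                   # hypothetical NELL-style relation name
print(' '.join(segment(name.split(':')[1])))  # likely prints "company ceo"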
Example #12
def test12(tagtocheck):
	d=en.Dict("en-US")
	correct = 0
	incorrect = 0
	words=ws.segment(tagtocheck)
	for x in words:
		if d.check(x)==False:
			incorrect+=1
		else:
			correct+=1
	if correct!= 0:
		return "%.4f"%(float(incorrect)/correct)
	else:
		return 0
Example #13
def create_dict_adva():
	relation_name=[x[2] for x in os.walk("nell/relations")][0]
	sub_table={}
	obj_table={}
	for r in relation_name:
		lst=[]
		r_name=' '.join(segment(r.split(':')[1]))
		print r_name
		with open("nell/relations/"+r) as fp:
			for line in fp:
				line=line.rstrip('\n')
				sub,obj=line.split('\t')
				sub=sub.split(":")[1:]
				obj=obj.split(":")[1:]
				for tmp in sub:
					tmpsb=''.join(tmp.split('_'))
					tmpsb=segment(tmpsb)
					for sb in tmpsb:
						if sb in sub_table:
							tmp=sub_table[sb]
							tmp=tmp.union([r_name])
							sub_table[sb]=tmp
							#print("y")
						else:
							sub_table[sb]=set([r_name])
				for tmp in obj:
					tmpob=''.join(tmp.split('_'))
					tmpob=segment(tmpob)
					for ob in tmpob:
						if ob in obj_table:
							tmp=obj_table[ob]
							tmp=tmp.union([r_name])
							obj_table[ob]=tmp
							#print("yy")
						else:
							obj_table[ob]=set([r_name])
	return sub_table,obj_table
Example #14
def read_relation_name(folder_name):
	"""
		Look inside folder_name and collect all relation names, where each relation comes from the names of the folders inside it. Each folder name is expected to have the format "concept:relation".
		
		return the list of relations
	"""
	#print folder_name
	folder_list=[]
	#print folder_name
	tmp=[x[0] for x in os.walk(folder_name)]
	#print tmp
	for name in tmp[1:]:
		#print name
		folder_list.append(' '.join(segment(name.split(':')[1])))
	return folder_list[1:]
Example #15
def checkTweetNums(tweets,minTweets):
	#number as adjective check
	count = 0
	processedtweets = []
	for line in tweets:
		processedtweets.append(" ".join(wordsegment.segment(line)))
	postags = cmu.runtagger_parse(processedtweets)
	for postag in postags:
		postag = "".join(postag)
		if "$N" in postag or "$^" in postag or "$M" in postag or "$Z" in postag:
			#Checking for Consecutive numbers and Nouns
			count += 1
	if count >= minTweets:
		return 1
	else:
		return 0
Example #16
def pos_tag_entropy(tagtocheck,pos_list):
    seg_st = segment(tagtocheck)
    len_list=len(pos_list)
    arr = []
    freq_list =[]
    for i in xrange(len_list):
        arr.append(pos_list[i])
    k = Counter(arr) #counts no of pos tags and their multiplicity
    for x in k:
        freq = float(k[x])/len_list
        freq_list.append(freq)
    ent = 0.0
    for j in freq_list:
        ent = ent + j * math.log(j, 2)
    ent = -ent
    return "%.4f"%(float(ent))
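The value returned above is the Shannon entropy (in bits) of the POS-tag distribution; a short worked sketch with hypothetical tags:

from collections import Counter
import math

pos_list = ['N', 'N', 'V', 'A']                       # hypothetical POS tags
freqs = [float(c) / len(pos_list) for c in Counter(pos_list).values()]
entropy = -sum(f * math.log(f, 2) for f in freqs)     # -(0.5*-1 + 0.25*-2 + 0.25*-2) = 1.5
print("%.4f" % entropy)                               # 1.5000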
Example #17
def getchunks(password):
    # split into character/digit/symbols chunks
    temp = re.findall('([\W_]+|[a-zA-Z]+|[0-9]+)', password)

    # split character chunks into word chunks
    chunks = []
    for chunk in temp:
        if chunk[0].isalpha() and len(chunk) > 1:
            words = ws.segment(chunk)
            chunks.extend(words)
        else:
            chunks.append(chunk)

    if len(chunks) == 0:
        log.warning("Unable to chunk password: {}".format(password))

    return chunks
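To see how the two-stage split above behaves, a short sketch on a made-up password; the word-level split depends on wordsegment's dictionaries, and ws.load() is assumed to be required before segmentation (wordsegment 1.0+):

import re
import wordsegment as ws

ws.load()
password = "iloveyou2!"                                    # hypothetical input
print(re.findall(r'([\W_]+|[a-zA-Z]+|[0-9]+)', password))  # ['iloveyou', '2', '!']
# with getchunks defined as above:
print(getchunks(password))                                 # likely ['i', 'love', 'you', '2', '!']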
Example #18
def getWeight(hashtag,text_file):
	#this function returns a list of weights of the strings in the text_file
	#proxy_handler
	proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'})
	opener = ulib.build_opener(proxy)
	ulib.install_opener(opener)
	#split the hashtag into words
	spl_hash = ws.segment(hashtag)
	req = ulib.Request('https://www.google.co.in/search?q='+'+'.join(spl_hash), headers={'User-Agent' : "Mozilla/5.0"})
	dumpdata = ulib.urlopen(req).read()
	dumpdata = ulib.unquote(dumpdata)
	
	urls_ = re.findall("(http[s]*://[^:<&%]*?)[\"& ]",dumpdata)
	
	urls = Set()
	
	for _ in urls_:
		if not "google" in _ and not "youtube" in _:
			urls.add(_)
	
	occurance = []
	for _url in urls:
		try:
			temp = get_occurence_list(_url,text_file)
			occurance.append(temp)
			#frequencies of string for url _url
		except:
			pass

	#now occurance is a list of lists containing frequencies for each url
	
	final = [0 for _ in range(len(occurance[0]))]

	_length = len(occurance)
	#_length is total number of urls present
	
	for _x in range(len(occurance[0])):
		_x1 = 0
		for _o in occurance:
			final[_x] += _o[_x]*(_length-_x1)
			#multiplying the frequency in each url by the url position from the bottom, which gives the weight
			_x1 += 1
	return final
Example #19
def checkCategories(hashtag):
	matches =[]
	hashtag = " ".join(ws.segment(hashtag))
	matches.append(re.match(".+?in\s\d+\swords",hashtag))
	matches.append(re.match(".+?in\s\d+\ssentences",hashtag))
	matches.append(re.match(".*?\d+\sreasons.+",hashtag))
	matches.append(re.match(".*?\d+\swords\sto.+",hashtag))
	matches.append(re.match("^reasons\s.+",hashtag))
	matches.append(re.match(".*?ways\sto.+",hashtag))
	matches.append(re.match(".*?how\sto.+",hashtag))
	matches.append(re.match(".*?\d+\sways\sto.+",hashtag))
	matches.append(re.match(".*?\d+\sthings\sto.+",hashtag))
	matches.append(re.match("^things.+",hashtag))
	matches.append(re.match("^describe.*?in.*?",hashtag))
	matches.append(re.match("^name\ssome.+?",hashtag))
	#Add new categories here if any are found
	for match in matches:
		if match:
			return 1
	return 0
Example #20
def getWeight(hashtag="",string=""):
	proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'})
	opener = ulib.build_opener(proxy)
	ulib.install_opener(opener)
	spl_hash = ws.segment(hashtag)
	req = ulib.Request('https://www.google.com/search?q='+'+'.join(spl_hash), headers={'User-Agent' : "Mozilla/5.0"})
	dumpdata = ulib.urlopen(req).read()
	urls = re.findall("(http.*?)[\" ]",dumpdata)
	weight = 0
	url = len(urls)
	occurance = []
	for _url in urls:
		req = ulib.Request(_url,headers={'User-Agent' : "Mozilla/5.0"})
		try:
			pagedata = ulib.urlopen(req).read()
			pagedata = pagedata.lower()
			occurance = re.findall(string.lower(),pagedata)
			weight+=len(occurance)*url
		except:
			pass
		url-=1
	return weight
Example #21
def meaningful_characters(domain):
    if domain == '' or domain == ' '  or len(domain) == 0:

        return (0,0,-100.0)
#    domain_length = float(len(domain))
#    domain = ''.join([i for i in domain if not i.isdigit()])
    char_count = 0
    ratio = 0.0
    pairwise_score = -100.0
#    bigram_counts = bigram_counts
#    breakdowns = break_down(" ".join(domain))
    breakdowns = []
    breakdowns = segment(domain)
#    tri_gram_results = calc_ngram(domain, 3)
#    four_gram_results = calc_ngram(domain, 4)
#    five_gram_results = calc_ngram(domain, 5)
#    six_gram_results = calc_ngram(domain, 6)
#    for item in tri_gram_results:
#        breakdowns.append(item[0])
#    for item in four_gram_results:
#        breakdowns.append(item[0])
#    for item in five_gram_results:
#        breakdowns.append(item[0])
#    for item in six_gram_results:
#        breakdowns.append(item[0])


    for word in breakdowns:
#        if word in dictionary:
#            char_count = char_count + 1
        if dictionary.check(word):
            char_count = char_count + 1
    ratio = float(char_count)/len(breakdowns)
    pairwise_score = meaningful_pairwise(breakdowns, ratio)

#    print '[info]:domain %s has been broken into %s words:%s. The meaningful score is %s. The pairwise meaningful score is %s\n' %(domain,str(len(breakdowns)),breakdowns, str(ratio), str(pairwise_score))

    return (ratio,len(breakdowns), float(pairwise_score))
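The ratio above is the fraction of segmented chunks that pass a dictionary check; a hedged sketch of just that part, assuming pyenchant provides the dictionary (as in Example #12) and leaving meaningful_pairwise aside:

import enchant
from wordsegment import load, segment

load()
dictionary = enchant.Dict("en_US")
breakdowns = segment("paypalsecurelogin")    # hypothetical domain, likely ['paypal', 'secure', 'login']
hits = sum(1 for w in breakdowns if dictionary.check(w))
print(float(hits) / len(breakdowns))         # share of chunks that are dictionary words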
Example #22
def test_segment_6():
    result = ['now', 'is', 'the', 'time', 'for', 'all', 'good']
    assert segment(''.join(result)) == result
Example #23
#for appending data to a file
import wordsegment
prefix = ''
suffix = ' f'
suffix1 = ' 0'
with open('../final_idiom_29k.txt', 'r') as src:
    with open('../', 'w') as dest:
       for line in src:
           seg_line = wordsegment.segment(line)
           if("you" in seg_line):
               dest.write('%s%s%s\n' % (prefix, line.rstrip('\n'), suffix))
           else:
               dest.write('%s%s%s\n' % (prefix, line.rstrip('\n'), suffix1))
Example #24
def test_segment_8():
    result = [
        'it', 'was', 'a', 'bright', 'cold', 'day', 'in', 'april', 'and', 'the',
        'clocks', 'were', 'striking', 'thirteen'
    ]
    assert segment(''.join(result)) == result
Example #25
file = open(argv[1]) #file containing socialList and nonSocialList hashtags
file_type = open(argv[2]) #file containing the types of hashtags
tofile = open(argv[3],"w") #file to take output arff
tofile.close()
idiomsEx = file.readlines()
list_type = file_type.readlines()

sociallists = [] # to take hashtags in a list

for line in idiomsEx:
	sociallists.append(line.replace("\n",""))

parsedSociallists = [] #parse the hashtags using str2num library and add them as a list

for line in sociallists:
	parsedSociallists.append(str2num.words2num(" ".join(ws.segment(line))))

postags = cmu.runtagger_parse(parsedSociallists) #gets a list of postags each for each hashtag

i = 0

for ParsedTag,postag,type in zip(parsedSociallists,postags,list_type):
	checkTweetsret = checkTweets.checkTweets(ParsedTag.replace(" ",""),"test/"+str(i/100)+"tweets.txt")
	#checks for the hashtag in the files provided.

	i+=1

	tofile = open(argv[3],"a")
	tofile.write(str(testFile1.test1(ParsedTag))+","+ #number of charcters in hashtag
	str(testFile2.test2(ParsedTag))+","+ #number of words in hashtag
	str(testFile4.test4(ParsedTag))+","+ #presence of days
Example #26
def segment(self, word):
    cleaned = clean(word)
    segmented = segment(cleaned)
    return segmented
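The clean step above mirrors wordsegment's own preprocessing (segment applies the same cleaning internally, so the explicit call is mostly a safeguard). A brief hedged sketch, assuming wordsegment 1.0+ with module-level clean/segment and load() called first:

from wordsegment import clean, load, segment

load()
print(clean("Can't buy me love!"))    # lowercases and drops non-alphanumerics: "cantbuymelove"
print(segment("Can't buy me love!"))  # likely ['cant', 'buy', 'me', 'love']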
Example #27
#for appending data to a file
import wordsegment
prefix = ''
suffix = ' f'
suffix1 = ' 0'
with open('../final_idiom_29k.txt', 'r') as src:
    with open('../', 'w') as dest:
        for line in src:
            seg_line = wordsegment.segment(line)
            if ("you" in seg_line):
                dest.write('%s%s%s\n' % (prefix, line.rstrip('\n'), suffix))
            else:
                dest.write('%s%s%s\n' % (prefix, line.rstrip('\n'), suffix1))
Example #28
def test_segment_5():
    result = ['speed', 'of', 'art']
    assert segment(''.join(result)) == result
Example #29
def nodes_saved(s):
    res0 = wordsegment.segment(s)
    res1 = segment_method2(s)
    return res0[2], res1[2]
Example #30
import CMUTweetTagger as cmu
import wordsegment as ws

file = open(argv[1]) #file containing socialList and nonsocialList hashtags
tofile = open(argv[2], "w") #file that takes the arff output
tofile.close()
idiomsEx = file.readlines()
sociallists = []

for line in idiomsEx:
	sociallists.append(line.replace("\n", ""))

parsedSociallists = []

for line in sociallists:
	parsedSociallists.append(" ".join(ws.segment(line)))

postags = cmu.runtagger_parse(parsedSociallists)

'''
The output file has one line per hashtag, in the format: popularity,precision at 10,precision at 20

This takes a lot of time to run.
'''

for ParsedTag, postag in zip(parsedSociallists, postags):
	tofile = open(argv[2], "a")
	a = testFile14.test14(ParsedTag, postag)
	#checks the hashtag on Google and returns a list of its popularity and its precision at 10 and at 20 urls
	print str(a[0]) + "," + str(a[1]) + "," + str(a[2])
	tofile.write(str(a[0]) + "," + str(a[1]) + "," + str(a[2]) + "\n")
Example #31
def test_segment_7():
    result = ['it', 'is', 'a', 'truth', 'universally', 'acknowledged']
    assert segment(''.join(result)) == result
Example #32
def test_segment_4():
    result = ['experts', 'exchange']
    assert segment(''.join(result)) == result
Example #33
def test_segment_3():
    result = ['who', 'represents']
    assert segment(''.join(result)) == result
Example #34
def test_segment_2():
    result = [
        'when', 'in', 'the', 'course', 'of', 'human', 'events', 'it',
        'becomes', 'necessary'
    ]
    assert segment(''.join(result)) == result
Example #35
def test_segment_1():
    result = ['this', 'is', 'a', 'test']
    assert segment(''.join(result)) == result
Example #36
def test_segment_0():
    result = ['choose', 'spain']
    assert segment(''.join(result)) == result
Example #37
def load_dataset(trFile=None, teFile=None):
    labelsAsNums = {}
    numsAsLabels = {}
    labelNum = 0
    numTweets = 0
    testTweets = []

    x_train = []
    y_train = []
    x_test = []

    # NULI used a max_sequence_length of 64
    max_sequence_length = 64
    wordsegment.load()

    #load in train tweets and corresponding labels
    if (trFile):
        with open(trFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].lower().strip()
                # uncomment to convert non standard characters to standard
                # text = convertChars(text)

                # emoji used in NULI: https://github.com/carpedm20/emoji
                # wordsegment used in NULI: https://github.com/grantjenks/python-wordsegment
                text = ' '.join(wordsegment.segment(emoji.demojize(text)))

                #if(len(text.split()) > max_sequence_length):
                #    max_sequence_length = len(text.split())

                # NULI replaced URL with http
                text = text.replace('URL', 'http')

                #x_train.append(text)

                # NULI limited @USER to three instances
                text = text.split()
                user_count = 0
                out_text = []

                for word in text:
                    if (word == '@USER'):
                        user_count += 1
                    else:
                        user_count = 0

                    if (user_count <= 3):
                        out_text.append(word)
                text = ' '.join(out_text)

                x_train.append(text)

                if tweet[2] not in labelsAsNums:
                    labelsAsNums[tweet[2]] = labelNum
                    numsAsLabels[labelNum] = tweet[2]
                    labelNum += 1
                y_train.append(labelsAsNums[tweet[2]])

    #load in test tweets and corresponding labels
    if (teFile):
        with open(teFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].lower().strip()

                text = convertChars(text)

                # emoji used in NULI: https://github.com/carpedm20/emoji
                # wordsegment used in NULI: https://github.com/grantjenks/python-wordsegment
                text = ' '.join(wordsegment.segment(emoji.demojize(text)))

                # NULI replaced URL with http
                text = text.replace('URL', 'http')

                # NULI limited @USER to three instances
                text = text.split()
                user_count = 0
                out_text = []

                for word in text:
                    if (word == '@USER'):
                        user_count += 1
                    else:
                        user_count = 0

                    if (user_count <= 3):
                        out_text.append(word)
                text = ' '.join(out_text)

                testTweets.append(tweet)
                x_test.append(text)

    return x_train, y_train, x_test, labelNum, testTweets, labelsAsNums, numsAsLabels, max_sequence_length
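The @USER-limiting loop above keeps at most three consecutive @USER tokens, as the NULI comment describes; an isolated sketch of just that step with a made-up tweet:

text = "@USER @USER @USER @USER @USER this is a tweet"   # hypothetical input
out_text, user_count = [], 0
for word in text.split():
    user_count = user_count + 1 if word == '@USER' else 0
    if user_count <= 3:
        out_text.append(word)
print(' '.join(out_text))    # "@USER @USER @USER this is a tweet"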
Example #38
from wordsegment import load, segment
load()
result = segment('thisisatest')
print(result)
Example #39
    sent_files = sys.argv[1:]

    for sent_file in sent_files:
        r = open(sent_file, "r")

        s = open(sent_file + "_sensitive.list", "w", buffering=0)
        if not os.path.isfile(sent_file):
            print(sent_file + " doesn't exist!")
            continue
        s.write("\n" + sent_file + "\n")

        # read lines
        for raw_line in r.readlines():
            sentence = ""
            segmentation = []
            line = ' '.join(segment(raw_line))

            if len(line.split()) < 2:
                continue
            line = line.replace(".", "").strip()
            print("\nRAW: " + line)

            # replace the abbreviation in dicts of sentences
            for word in line.split(' '):
                replace_flag = 0
                for words_abbr in WORDS_ABBR.keys():
                    if word == words_abbr or (words_abbr in word
                                              and WORDS_ABBR[words_abbr]
                                              not in line):
                        line = line.replace(words_abbr,
                                            " " + WORDS_ABBR[words_abbr] +
Example #40
# Copyright (C) 2017 Yerai Doval
# You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/gpl.txt>

import wordsegment as ws
import nltk
import pickle
import sys


def identity(s):
    return s


def load_obj(name):
    with open(name, 'rb') as f:
        return pickle.load(f)


if len(sys.argv) == 3:
    ws.UNIGRAMS = load_obj(sys.argv[1])
    ws.BIGRAMS = load_obj(sys.argv[2])
    ws.TOTAL = float(sum(ws.UNIGRAMS.values()))

ws.clean = identity

for line in sys.stdin:
    line = line.replace("\n", "")
    seg = " ".join(ws.segment(line))
    print(line + "\t" + " ".join(nltk.word_tokenize(seg)).replace(
        "``", "\"").replace("\'\'", "\"") + "\t N/A")
Example #41
@Licence :
	This work is licensed under the
	Creative Commons Attribution-NonCommercial-ShareAlike 4.0
	International License. To view a copy of this license,
	visit http://creativecommons.org/licenses/by-nc-sa/4.0/.
'''

import CMUTweetTagger as cmu
import wordsegment as ws

file1 = open()
file2 = open()

data1 = file1.read()
data2 = file2.read()

tweets1 = data1.split("\n\n")

hashtags = []

for tweet1 in tweets1:
	hashtag = tweet1.split("\n")[0]
	hashtags.append(" ".join(ws.segment(hashtag)))

postags = cmu.runtagger_parse(hashtags)

i=0

for postag in postags:
	if '$' in "".join(postag):
		i+=1
Example #42
def load_dataset(trFile=None, teFile=None):
    labelsAsNums = {}
    numsAsLabels = {}
    labelNum = 0
    numTweets = 0
    testTweets = []

    x_train = []
    y_train = []
    x_test = []

    max_sequence_length = -1
    wordsegment.load()

    if (trFile):
        #load in train tweets and corresponding labels
        with open(trFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].strip()

                #replace any non standard characters with standard
                #text = convertChars(text)

                # 'The symbols ’@’ and ’#’ were excluded from the list due to their specific semantics in tweets.' - vradivchev
                text = text.replace('#', '')
                text = text.replace('@', '')

                # 'All occurrences of tokens beginning with a hashtag were split into the separate words
                # comprising the token, provided that each separate word is uppercased ' - vradivchev
                text = ' '.join(wordsegment.segment(text))

                # 'Afterwards the tweets were subjected to tokenization and lowercasing' - vradivchev
                text = " ".join(nltk.word_tokenize(text))
                text = text.lower()

                # 'Afterwards we proceeded with removing a variety of different stop words' - vradivchev
                stop_words = set(stopwords.words('english'))
                text = ' '.join(
                    [w for w in text.split() if not w in stop_words])

                x_train.append(text)
                if (len(text.split()) > max_sequence_length):
                    max_sequence_length = len(text.split())

                if tweet[2] not in labelsAsNums:
                    labelsAsNums[tweet[2]] = labelNum
                    numsAsLabels[str(labelNum)] = tweet[2]
                    labelNum += 1
                y_train.append(labelsAsNums[tweet[2]])

    #load in test tweets and corresponding labels
    if (teFile):
        with open(teFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].strip()

                text = convertChars(text)
                # 'The symbols ’@’ and ’#’ were excluded from the list due to their specific semantics in tweets.' - vradivchev
                text = text.replace('#', '')
                text = text.replace('@', '')

                # 'All occurrences of tokens beginning with a hashtag were split into the separate words
                # comprising the token, provided that each separate word is uppercased ' - vradivchev
                text = ' '.join(wordsegment.segment(text))

                # 'Afterwards the tweets were subjected to tokenization and lowercasing' - vradivchev
                text = " ".join(nltk.word_tokenize(text))
                text = text.lower()

                # 'Afterwards we proceeded with removing a variety of different stop words' - vradivchev
                stop_words = set(stopwords.words('english'))
                text = ' '.join(
                    [w for w in text.split() if not w in stop_words])

                testTweets.append(tweet)
                x_test.append(text)

    return x_train, y_train, x_test, labelNum, testTweets, labelsAsNums, numsAsLabels, max_sequence_length
Example #43
    temp = [float(embed) for embed in temp]
    all_vector.append(temp)
vector_file.close()

new_word2vec300 = {}
for i in range(len(all_word_unique)):
    new_word2vec300[all_word_unique[i]] = all_vector[i]

pickle.dump(new_word2vec300, open('new_word2vec300.pickle', 'wb'))
new_word2vec300 = pickle.load(open('new_word2vec300.pickle', 'rb'))

# max_count = 0
tweet_embedding = []
for tweet in tweet_list:
    #     temp = tweet.split(" ")
    temp = segment(tweet.replace('@USER', ''))
    sentence_embed = np.zeros(300)
    word_count = 0
    for word in temp:
        try:
            #             sentence_embed += word2vec300[word]
            sentence_embed += new_word2vec300[word]
            word_count += 1
        except:
            pass
#     if word_count > max_count:
#         max_count = word_count
#         print(tweet)
    tweet_embedding.append(sentence_embed / word_count)

tweet_embedding_b = np.array(tweet_embedding)[~pd.isnull(subtask_blist)]
Example #44
def test_segment_12():
    assert segment('faroutintheunchartedbackwatersoftheunfashionableendofthewesternspiralarmofthegalaxyliesasmallunregardedyellowsun') == ['far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the', 'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of', 'the', 'galaxy', 'lies', 'a', 'small', 'un', 'regarded', 'yellow', 'sun']
Example #45
def split(word):
    return jsonify(result=segment(word))
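jsonify suggests this view lives in a Flask app; a minimal, hypothetical wiring for it (the route, app setup, and startup load() call are assumptions, not part of the original):

from flask import Flask, jsonify
from wordsegment import load, segment

app = Flask(__name__)
load()    # load wordsegment's data once at startup

@app.route('/split/<word>')
def split(word):
    return jsonify(result=segment(word))

# GET /split/thisisatest  ->  {"result": ["this", "is", "a", "test"]}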
Example #46
def test_segment_0():
    assert segment('choosespain') == ['choose', 'spain']
Example #47
def test14(parsedTag, postag):
    nounpart = []
    k = 0
    ret = []
    splitline = parsedTag.split()
    for x in postag:
        if (x == 'M' or x == '^' or x == 'Z'):  # use '==', not 'is': identity checks against string literals are unreliable
            nounpart.append(splitline[k])
        k += 1

    if " ".join(nounpart) == "":
        ret.append(2)
    while True:
        try:
            googledata = searchWeb.searchgoogle(parsedTag)
            #gets all the urls for the hashtag on google search
            break
        except:
            continue
    count = 0
    i = 1
    for site in googledata:
        try:
            if searchWeb.searchforstring(site, nounpart):
                #checks if the hashtag noun parts are popular by counting the number of websites they are present in
                count += 1
        except:
            pass
        i += 1
        if i > 10:
            break
    if count > 5:
        ret.append(1)
    else:
        ret.append(0)
    seg = parsedTag.split()
    m = []
    for n in seg:
        m.append(stemming.porter2.stem(n))
    seg = " ".join(m)

    if socialListProxy:

        proxy = ulib.ProxyHandler({
            'http': socialListHttp_Proxy,
            'https': socialListHttps_Proxy
        })
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)

    counter = 0
    total = 0
    for site in googledata:
        req = ulib.Request(site, headers={'User-Agent': "Mozilla/5.0"})
        site = segment(site)
        l = []
        for j in site:
            l.append(stemming.porter2.stem(j))
        site = " ".join(l)
        try:
            content = ulib.urlopen(req)
            x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>",
                           content.read())
            #searches for a match of the hashtag in the title and url of every page
            t = []
            for s in x:
                t.append(stemming.porter2.stem(s))
            t = " ".join(t)
            if ((seg in site) or (seg in t)):
                counter = counter + 1
            total = total + 1
        except:
            pass

        if (total == 10):
            ret.append("%.4f" % (float(counter) / total))
        if (total == 20):
            ret.append("%.4f" % (float(counter) / total))
            break

    if total < 10:
        ret.append("%.4f" % (float(counter) / 10.0))
        ret.append("%.4f" % (counter / 20.0))
    elif total < 20:
        ret.append("%.4f" % (float(counter) / 20.0))
    return ret
Example #48
def test_segment_1():
    assert segment('thisisatest') == ['this', 'is', 'a', 'test']
Example #49
def performSegmentation(word):
    flag = word.endswith('.')
    new_word = " ".join(segment(word))
    if flag:
        new_word += '.'
    return new_word
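A quick sketch of the trailing-period handling above; the splits shown are the likely wordsegment output once load() has been called:

from wordsegment import load, segment

load()
# with performSegmentation defined as above:
print(performSegmentation("wordsegmentation."))  # likely "word segmentation."
print(performSegmentation("wordsegmentation"))   # likely "word segmentation"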
Example #50
import wordsegment
from wordsegment import segment
segment('thisisatest')
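Note that in wordsegment 1.0 and later the frequency data is not loaded at import time, so the snippet above needs an explicit load() call first, exactly as Example #38 does:

from wordsegment import load, segment

load()
segment('thisisatest')    # ['this', 'is', 'a', 'test']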