def main(arg="iamtoocoolforthis"):
    s = clean(arg)
    print "CLEANED STRING:", s
    print "======================RUNNING OPTIMIZED==================="
    print segment_method1(s)
    print "======================RUNNING VANILLA==================="
    print segment(s)
def precisioncalc(query):
    print query,
    k = searchgoogle(query)
    seg = segment(query)
    m = []
    for n in seg:
        m.append(stemming.porter2.stem(n))
    seg = " ".join(m)
    if socialListProxy:
        proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)
    counter = 0
    total = 0
    for i in xrange(len(k)):
        req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
        k[i] = segment(k[i])
        l = []
        for j in k[i]:
            l.append(stemming.porter2.stem(j))
        k[i] = " ".join(l)  # join the stemmed words, not the unstemmed segmentation
        # print k[i]
        try:
            content = ulib.urlopen(req)
            x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
            t = []
            for s in x:
                t.append(stemming.porter2.stem(s))
            t = " ".join(t)
            # print t
            if (seg in k[i]) or (seg in t):
                counter = counter + 1
            total = total + 1
        except:
            pass
        if total == 10:
            print str(counter) + "/" + str(total),
        if total == 20:
            print str(counter) + "/" + str(total),
    if total < 10:
        print str(counter) + "/" + str(10), str(counter) + "/" + str(20)
    elif total < 20:
        print str(counter) + "/" + str(20)
    else:
        print ""

# precisioncalc("madhusai")  # uncomment this to check the precision of some word
def test_segment_10():
    result = [
        'as', 'gregor', 'samsa', 'awoke', 'one', 'morning', 'from', 'uneasy',
        'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed',
        'into', 'a', 'gigantic', 'insect'
    ]
    assert segment(''.join(result)) == result

def test_segment_12():
    result = [
        'far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the',
        'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of',
        'the', 'galaxy', 'lies', 'a', 'small', 'un', 'regarded', 'yellow', 'sun'
    ]
    assert segment(''.join(result)) == result
def create_dict():
    relation_name = [x[2] for x in os.walk("nell/relations")][0]
    sub_table = {}
    obj_table = {}
    for r in relation_name:
        lst = []
        r_name = ' '.join(segment(r.split(':')[1]))
        print r_name
        with open("nell/relations/" + r) as fp:
            for line in fp:
                line = line.rstrip('\n')
                sub, obj = line.split('\t')
                sub = ' '.join((sub.split(":")[2]).split('_'))
                obj = ' '.join((obj.split(":")[2]).split('_'))
                if sub in sub_table:
                    tmp = sub_table[sub]
                    tmp = tmp.union([r_name])
                    sub_table[sub] = tmp
                    # print("y")
                else:
                    sub_table[sub] = set([r_name])
                if obj in obj_table:
                    tmp = obj_table[obj]
                    tmp = tmp.union([r_name])
                    obj_table[obj] = tmp
                    # print("yy")
                else:
                    obj_table[obj] = set([r_name])
                # print len(sub_table[sub]), len(obj_table[obj])
    return sub_table, obj_table
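# A hedged illustration of the relation-name normalization used above: NELL relation
# files are named like "concept:athleteplayssport", and the part after the colon is
# word-segmented into a readable name (assuming the wordsegment data is loaded).
example_relation = "concept:athleteplayssport"
readable = ' '.join(segment(example_relation.split(':')[1]))
# readable would typically be "athlete plays sport"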
def info_extract(u):
    final_string = ""
    twe = url.split(u)
    newtweet = ""
    for a in range(len(twe)):
        newtweet = newtweet + twe[a] + " "
    text = sep.split(newtweet)
    tex = ""
    for i in range(len(text)):
        if hasht.match(text[i]) or atp.match(text[i]):
            m = text[i][1:]
            text[i] = segment(m.lower())
            n = ""
            for j in text[i]:
                n = n + j + " "
            text[i] = n
        tex += text[i] + " "
    final_string = final_string + categorize(tex) + "####"
    final_string = final_string + babelnet(tex) + "####"
    twee = url.search(u)
    try:
        urls = str(twee.group(0))
        final_string = final_string + url_categ(urls) + "<br>"
    except:
        pass
    final_string = final_string + twe_cat(tex) + "####"
    final_string = final_string + senti(u) + "####"
    return final_string
def test_segment_9():
    result = [
        'it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst',
        'of', 'times', 'it', 'was', 'the', 'age', 'of', 'wisdom', 'it', 'was',
        'the', 'age', 'of', 'foolishness'
    ]
    assert segment(''.join(result)) == result
def k_list_repeat(query):
    k = searchgoogle(query)
    m = []
    if socialListProxy:
        proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)
    for i in xrange(len(k)):
        req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
        k[i] = segment(k[i])
        l = []
        for j in k[i]:
            l.append(stemming.porter2.stem(j))
        k[i] = " ".join(l)  # join the stemmed words, not the unstemmed segmentation
        # print k[i]
        try:
            content = ulib.urlopen(req)
            # reading the title of the url
            x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
            t = []
            for s in x:
                t.append(stemming.porter2.stem(s))
            t = " ".join(t)
            m.append(t)
        except:
            pass
    return m
def segment_hashtag(h):
    """Segment the words inside the hashtag h, discarding non-alphanumeric chars."""
    if hasattr(h, "group"):
        h = h.group()[1:]
    else:
        h = h[1:]
    # print(h, " hashtag " + wordsegment.segment(h) + " . ")
    return " hashtag " + " ".join(wordsegment.segment(h)) + " , "
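# A minimal usage sketch for segment_hashtag above; recent wordsegment releases
# require load() to be called before segment():
import wordsegment
wordsegment.load()
print(segment_hashtag("#MondayMotivation"))
# expected output: " hashtag monday motivation , "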
def get_word_vector(self, word):
    if word is None:
        return None
    word = word.strip().strip('[').strip(']').strip('(').strip(')')
    word_lower = word.lower()
    word_upper = word.upper()
    try:
        if word_lower not in self.word_vectors_map:
            if config.debug:
                print 'getting word vector for ', word
            if word in self.word2vec_model.vocab:
                # todo: if vocab is ensured to be lower case, this condition is not required
                self.word_vectors_map[word_lower] = self.word2vec_model[word]
            elif word_lower in self.word2vec_model.vocab:
                self.word_vectors_map[word_lower] = self.word2vec_model[word_lower]
            elif word_upper in self.word2vec_model.vocab:
                self.word_vectors_map[word_lower] = self.word2vec_model[word_upper]
            else:
                if not constants.concept_regexp.sub('', word):
                    return self.get_word_vector(constants.alpha_regex.sub('', word))
                subwords = word.split()
                if len(subwords) == 1:
                    subwords = word.split(',')
                if len(subwords) == 1:
                    subwords = word.split('/')
                if len(subwords) == 1:
                    subwords = word.split(':')
                if len(subwords) == 1:
                    subwords = word.split('-')
                if len(subwords) == 1:
                    subwords = word.split('_')
                if len(subwords) == 1:
                    # print 'performing word segmentation on ', word
                    subwords = ws.segment(word.encode('utf8'))
                if len(subwords) == 1:
                    print 'could not get wordvector for ', word
                    self.word_vectors_map[word_lower] = None
                if len(subwords) > 1:
                    curr_wordvec = None
                    for curr_subword in subwords:
                        curr_subword_vec = self.get_word_vector(curr_subword)
                        if curr_subword_vec is not None:
                            if curr_wordvec is None:
                                curr_wordvec = curr_subword_vec
                            else:
                                start_time = time.time()
                                curr_wordvec = ss.fftconvolve(curr_wordvec, curr_subword_vec, mode='same')
                                if config.debug:
                                    print 'performed fast fourier transform convolution on word vectors in {} seconds.'.format(time.time() - start_time)
                    self.word_vectors_map[word_lower] = curr_wordvec
        return self.word_vectors_map[word_lower]
    except UnicodeDecodeError as ude:
        print 'error getting word vector for ', word
        print ude.message
        self.word_vectors_map[word_lower] = None
        return self.word_vectors_map[word_lower]
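# A small sketch of the fftconvolve composition used above: scipy.signal.fftconvolve
# with mode='same' returns a vector of the same length as the first input. The random
# vectors here are purely illustrative.
import numpy as np
import scipy.signal as ss

v1 = np.random.rand(300)
v2 = np.random.rand(300)
combined = ss.fftconvolve(v1, v2, mode='same')  # shape (300,)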
def read_nell_relations():
    """
    Read the relations from the NELL graph and return them as a list.
    """
    rel = os.walk("nell/relations")
    relation = []
    for i in rel:
        trel = i[2]
        for i in trel:
            relation.append(' '.join(segment(i.split(':')[1])))
    return relation
def test12(tagtocheck):
    d = en.Dict("en-US")
    correct = 0
    incorrect = 0
    words = ws.segment(tagtocheck)
    for x in words:
        if d.check(x) == False:
            incorrect += 1
        else:
            correct += 1
    if correct != 0:
        return "%.4f" % (float(incorrect) / correct)
    else:
        return 0
def create_dict_adva():
    relation_name = [x[2] for x in os.walk("nell/relations")][0]
    sub_table = {}
    obj_table = {}
    for r in relation_name:
        lst = []
        r_name = ' '.join(segment(r.split(':')[1]))
        print r_name
        with open("nell/relations/" + r) as fp:
            for line in fp:
                line = line.rstrip('\n')
                sub, obj = line.split('\t')
                sub = sub.split(":")[1:]
                obj = obj.split(":")[1:]
                for tmp in sub:
                    tmpsb = ''.join(tmp.split('_'))
                    tmpsb = segment(tmpsb)
                    for sb in tmpsb:
                        if sb in sub_table:
                            tmp = sub_table[sb]
                            tmp = tmp.union([r_name])
                            sub_table[sb] = tmp
                            # print("y")
                        else:
                            sub_table[sb] = set([r_name])
                for tmp in obj:
                    tmpob = ''.join(tmp.split('_'))
                    tmpob = segment(tmpob)
                    for ob in tmpob:
                        if ob in obj_table:
                            tmp = obj_table[ob]
                            tmp = tmp.union([r_name])
                            obj_table[ob] = tmp
                            # print("yy")
                        else:
                            obj_table[ob] = set([r_name])
    return sub_table, obj_table
def read_relation_name(folder_name):
    """
    Look inside folder_name and collect all relations, where each relation is the
    name of a subfolder. Each folder name is expected to have the format
    "concept:relation". Returns the list of relation names.
    """
    # print folder_name
    folder_list = []
    tmp = [x[0] for x in os.walk(folder_name)]
    # print tmp
    for name in tmp[1:]:
        # print name
        folder_list.append(' '.join(segment(name.split(':')[1])))
    return folder_list[1:]
def checkTweetNums(tweets, minTweets):
    # number-as-adjective check
    count = 0
    processedtweets = []
    for line in tweets:
        processedtweets.append(" ".join(wordsegment.segment(line)))
    postags = cmu.runtagger_parse(processedtweets)
    for postag in postags:
        postag = "".join(postag)
        if "$N" in postag or "$^" in postag or "$M" in postag or "$Z" in postag:
            # checking for consecutive numbers and nouns
            count += 1
    if count >= minTweets:
        return 1
    else:
        return 0
def pos_tag_entropy(tagtocheck, pos_list):
    seg_st = segment(tagtocheck)
    len_list = len(pos_list)
    arr = []
    freq_list = []
    for i in xrange(len_list):
        arr.append(pos_list[i])
    k = Counter(arr)  # counts the POS tags and their multiplicities
    for x in k:
        freq = float(k[x]) / len_list
        freq_list.append(freq)
    ent = 0.0
    for j in freq_list:
        ent = ent + j * math.log(j, 2)
    ent = -ent
    return "%.4f" % (float(ent))
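# A worked example of the entropy computed above: for the tag list ['N', 'N', 'V', 'A']
# the frequencies are {N: 0.5, V: 0.25, A: 0.25}, so the entropy is
# -(0.5*log2(0.5) + 2 * 0.25*log2(0.25)) = 1.5 bits and the function returns "1.5000".
# (The hashtag argument only feeds the unused seg_st variable.)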
def getchunks(password):
    # split into character/digit/symbol chunks
    temp = re.findall('([\W_]+|[a-zA-Z]+|[0-9]+)', password)
    # split character chunks into word chunks
    chunks = []
    for chunk in temp:
        if chunk[0].isalpha() and len(chunk) > 1:
            words = ws.segment(chunk)
            chunks.extend(words)
        else:
            chunks.append(chunk)
    if len(chunks) == 0:
        log.warning("Unable to chunk password: {}".format(password))
    return chunks
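# A hedged usage sketch for getchunks, assuming wordsegment is imported as `ws`
# and its data has been loaded:
import wordsegment as ws
ws.load()
print(getchunks("iloveyou2!!"))
# expected something like: ['i', 'love', 'you', '2', '!!']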
def getWeight(hashtag, text_file):
    # returns a list of weights for the strings in text_file
    # proxy handler
    proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080', 'http': 'http://10.3.100.207:8080'})
    opener = ulib.build_opener(proxy)
    ulib.install_opener(opener)
    # split the hashtag into words
    spl_hash = ws.segment(hashtag)
    req = ulib.Request('https://www.google.co.in/search?q=' + '+'.join(spl_hash), headers={'User-Agent': "Mozilla/5.0"})
    dumpdata = ulib.urlopen(req).read()
    dumpdata = ulib.unquote(dumpdata)
    urls_ = re.findall("(http[s]*://[^:<&%]*?)[\"& ]", dumpdata)
    urls = Set()
    for _ in urls_:
        if not "google" in _ and not "youtube" in _:
            urls.add(_)
    occurance = []
    for _url in urls:
        try:
            temp = get_occurence_list(_url, text_file)
            occurance.append(temp)  # frequencies of the strings for url _url
        except:
            pass
    # now occurance is a list of lists containing the frequencies for each url
    final = [0 for _ in range(len(occurance[0]))]
    _length = len(occurance)  # _length is the total number of urls present
    for _x in range(len(occurance[0])):
        _x1 = 0
        for _o in occurance:
            # multiplying the frequency in each url by the url position from the bottom, which gives the weight
            final[_x] += _o[_x] * (_length - _x1)
            _x1 += 1
    return final
def checkCategories(hashtag):
    matches = []
    hashtag = " ".join(ws.segment(hashtag))
    matches.append(re.match(".+?in\s\d+\swords", hashtag))
    matches.append(re.match(".+?in\s\d+\ssentences", hashtag))
    matches.append(re.match(".*?\d+\sreasons.+", hashtag))
    matches.append(re.match(".*?\d+\swords\sto.+", hashtag))
    matches.append(re.match("^reasons\s.+", hashtag))
    matches.append(re.match(".*?ways\sto.+", hashtag))
    matches.append(re.match(".*?how\sto.+", hashtag))
    matches.append(re.match(".*?\d+\sways\sto.+", hashtag))
    matches.append(re.match(".*?\d+\sthings\sto.+", hashtag))
    matches.append(re.match("^things.+", hashtag))
    matches.append(re.match("^describe.*?in.*?", hashtag))
    matches.append(re.match("^name\ssome.+?", hashtag))
    # add new categories here if any are found
    for match in matches:
        if match:
            return 1
    return 0
def getWeight(hashtag="",string=""): proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'}) opener = ulib.build_opener(proxy) ulib.install_opener(opener) spl_hash = ws.segment(hashtag) req = ulib.Request('https://www.google.com/search?q='+'+'.join(spl_hash), headers={'User-Agent' : "Mozilla/5.0"}) dumpdata = ulib.urlopen(req).read() urls = re.findall("(http.*?)[\" ]",dumpdata) weight = 0 url = len(urls) occurance = [] for _url in urls: req = ulib.Request(_url,headers={'User-Agent' : "Mozilla/5.0"}) try: pagedata = ulib.urlopen(req).read() pagedata = pagedata.lower() occurance = re.findall(string.lower(),pagedata) weight+=len(occurance)*url except: pass url-=1 return weight
def meaningful_characters(domain):
    if domain == '' or domain == ' ' or len(domain) == 0:
        return (0, 0, -100.0)
    # domain_length = float(len(domain))
    # domain = ''.join([i for i in domain if not i.isdigit()])
    char_count = 0
    ratio = 0.0
    pairwise_score = -100.0
    # bigram_counts = bigram_counts
    # breakdowns = break_down(" ".join(domain))
    breakdowns = []
    breakdowns = segment(domain)
    # tri_gram_results = calc_ngram(domain, 3)
    # four_gram_results = calc_ngram(domain, 4)
    # five_gram_results = calc_ngram(domain, 5)
    # six_gram_results = calc_ngram(domain, 6)
    # for item in tri_gram_results:
    #     breakdowns.append(item[0])
    # for item in four_gram_results:
    #     breakdowns.append(item[0])
    # for item in five_gram_results:
    #     breakdowns.append(item[0])
    # for item in six_gram_results:
    #     breakdowns.append(item[0])
    for word in breakdowns:
        # if word in dictionary:
        #     char_count = char_count + 1
        if dictionary.check(word):
            char_count = char_count + 1
    ratio = float(char_count) / len(breakdowns)
    pairwise_score = meaningful_pairwise(breakdowns, ratio)
    # print '[info]: domain %s has been broken into %s words: %s. The meaningful score is %s. The pairwise meaningful score is %s\n' % (domain, str(len(breakdowns)), breakdowns, str(ratio), str(pairwise_score))
    return (ratio, len(breakdowns), float(pairwise_score))
def test_segment_6():
    result = ['now', 'is', 'the', 'time', 'for', 'all', 'good']
    assert segment(''.join(result)) == result
# for appending data in a file
import wordsegment

prefix = ''
suffix = ' f'
suffix1 = ' 0'
with open('../final_idiom_29k.txt', 'r') as src:
    with open('../', 'w') as dest:
        for line in src:
            seg_line = wordsegment.segment(line)
            if "you" in seg_line:
                dest.write('%s%s%s\n' % (prefix, line.rstrip('\n'), suffix))
            else:
                dest.write('%s%s%s\n' % (prefix, line.rstrip('\n'), suffix1))
def test_segment_8():
    result = [
        'it', 'was', 'a', 'bright', 'cold', 'day', 'in', 'april', 'and', 'the',
        'clocks', 'were', 'striking', 'thirteen'
    ]
    assert segment(''.join(result)) == result
file = open(argv[1])  # file containing socialList and nonSocialList hashtags
file_type = open(argv[2])  # file containing the types of hashtags
tofile = open(argv[3], "w")  # file to take the output arff
tofile.close()
idiomsEx = file.readlines()
list_type = file_type.readlines()
sociallists = []  # to take the hashtags in a list
for line in idiomsEx:
    sociallists.append(line.replace("\n", ""))
parsedSociallists = []
# parse the hashtags using the str2num library and add them as a list
for line in sociallists:
    parsedSociallists.append(str2num.words2num(" ".join(ws.segment(line))))
postags = cmu.runtagger_parse(parsedSociallists)  # gets a list of postags, one for each hashtag
i = 0
for ParsedTag, postag, type in zip(parsedSociallists, postags, list_type):
    # checks for the hashtag in the files provided
    checkTweetsret = checkTweets.checkTweets(ParsedTag.replace(" ", ""), "test/" + str(i / 100) + "tweets.txt")
    i += 1
    tofile = open(argv[3], "a")
    tofile.write(str(testFile1.test1(ParsedTag)) + "," +   # number of characters in hashtag
                 str(testFile2.test2(ParsedTag)) + "," +   # number of words in hashtag
                 str(testFile4.test4(ParsedTag)) + "," +   # presence of days
def segment(self, word):
    cleaned = clean(word)
    segmented = segment(cleaned)
    return segmented
def test_segment_5():
    result = ['speed', 'of', 'art']
    assert segment(''.join(result)) == result
def nodes_saved(s):
    res0 = wordsegment.segment(s)
    res1 = segment_method2(s)
    return res0[2], res1[2]
import CMUTweetTagger as cmu
import wordsegment as ws

file = open(argv[1])  # file containing socialList and nonSocialList hashtags
tofile = open(argv[2], "w")  # file that takes the arff output
tofile.close()
idiomsEx = file.readlines()
sociallists = []
for line in idiomsEx:
    sociallists.append(line.replace("\n", ""))
parsedSociallists = []
for line in sociallists:
    parsedSociallists.append(" ".join(ws.segment(line)))
postags = cmu.runtagger_parse(parsedSociallists)
'''
The output file has the format popularity, precision at 10, precision at 20
on each line, one line per hashtag. This takes a lot of time to run.
'''
for ParsedTag, postag in zip(parsedSociallists, postags):
    tofile = open(argv[2], "a")
    # checks the hashtag on Google and returns a list with its popularity,
    # precision at 10 urls, and precision at 20 urls
    a = testFile14.test14(ParsedTag, postag)
    print str(a[0]) + "," + str(a[1]) + "," + str(a[2])
    tofile.write(str(a[0]) + "," + str(a[1]) + "," + str(a[2]) + "\n")
def test_segment_7():
    result = ['it', 'is', 'a', 'truth', 'universally', 'acknowledged']
    assert segment(''.join(result)) == result

def test_segment_4():
    result = ['experts', 'exchange']
    assert segment(''.join(result)) == result

def test_segment_3():
    result = ['who', 'represents']
    assert segment(''.join(result)) == result

def test_segment_2():
    result = [
        'when', 'in', 'the', 'course', 'of', 'human', 'events', 'it',
        'becomes', 'necessary'
    ]
    assert segment(''.join(result)) == result

def test_segment_1():
    result = ['this', 'is', 'a', 'test']
    assert segment(''.join(result)) == result

def test_segment_0():
    result = ['choose', 'spain']
    assert segment(''.join(result)) == result
def load_dataset(trFile=None, teFile=None):
    labelsAsNums = {}
    numsAsLabels = {}
    labelNum = 0
    numTweets = 0
    testTweets = []
    x_train = []
    y_train = []
    x_test = []
    # NULI used a max_sequence_length of 64
    max_sequence_length = 64
    wordsegment.load()

    # load in train tweets and corresponding labels
    if trFile:
        with open(trFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].lower().strip()
                # uncomment to convert non standard characters to standard
                # text = convertChars(text)
                # emoji used in NULI: https://github.com/carpedm20/emoji
                # wordsegment used in NULI: https://github.com/grantjenks/python-wordsegment
                text = ' '.join(wordsegment.segment(emoji.demojize(text)))
                # if len(text.split()) > max_sequence_length:
                #     max_sequence_length = len(text.split())
                # NULI replaced URL with http
                text = text.replace('URL', 'http')
                # x_train.append(text)
                # NULI limited @USER to three instances
                text = text.split()
                user_count = 0
                out_text = []
                for word in text:
                    if word == '@USER':
                        user_count += 1
                    else:
                        user_count = 0
                    if user_count <= 3:
                        out_text.append(word)
                text = ' '.join(out_text)
                x_train.append(text)
                if tweet[2] not in labelsAsNums:
                    labelsAsNums[tweet[2]] = labelNum
                    numsAsLabels[labelNum] = tweet[2]
                    labelNum += 1
                y_train.append(labelsAsNums[tweet[2]])

    # load in test tweets and corresponding labels
    if teFile:
        with open(teFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].lower().strip()
                text = convertChars(text)
                # emoji used in NULI: https://github.com/carpedm20/emoji
                # wordsegment used in NULI: https://github.com/grantjenks/python-wordsegment
                text = ' '.join(wordsegment.segment(emoji.demojize(text)))
                # NULI replaced URL with http
                text = text.replace('URL', 'http')
                # NULI limited @USER to three instances
                text = text.split()
                user_count = 0
                out_text = []
                for word in text:
                    if word == '@USER':
                        user_count += 1
                    else:
                        user_count = 0
                    if user_count <= 3:
                        out_text.append(word)
                text = ' '.join(out_text)
                testTweets.append(tweet)
                x_test.append(text)

    return x_train, y_train, x_test, labelNum, testTweets, labelsAsNums, numsAsLabels, max_sequence_length
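# A self-contained sketch of the "@USER limited to three instances" step used in
# load_dataset above; the helper name is illustrative, not from the original file:
def limit_user_mentions(text, max_repeats=3):
    out, run = [], 0
    for word in text.split():
        run = run + 1 if word == '@USER' else 0  # count consecutive @USER tokens
        if run <= max_repeats:
            out.append(word)
    return ' '.join(out)

# limit_user_mentions('@USER @USER @USER @USER hello') -> '@USER @USER @USER hello'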
from wordsegment import load, segment

load()
result = segment('thisisatest')
print(result)
sent_files = sys.argv[1:]
for sent_file in sent_files:
    r = open(sent_file, "r")
    s = open(sent_file + "_sensitive.list", "w", buffering=0)
    if not os.path.isfile(sent_file):
        print(sent_file + " does not exist!")
        continue
    s.write("\n" + sent_file + "\n")
    # read lines
    for raw_line in r.readlines():
        sentence = ""
        segmentation = []
        line = ' '.join(segment(raw_line))
        if len(line.split()) < 2:
            continue
        line = line.replace(".", "").strip()
        print("\nRAW: " + line)
        # replace the abbreviations from the dicts in the sentence
        for word in line.split(' '):
            replace_flag = 0
            for words_abbr in WORDS_ABBR.keys():
                if word == words_abbr or (words_abbr in word and WORDS_ABBR[words_abbr] not in line):
                    line = line.replace(words_abbr, " " + WORDS_ABBR[words_abbr] +
# Copyright (C) 2017 Yerai Doval
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/gpl.txt>
import wordsegment as ws
import nltk
import pickle
import sys

def identity(s):
    return s

def load_obj(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

if len(sys.argv) == 3:
    ws.UNIGRAMS = load_obj(sys.argv[1])
    ws.BIGRAMS = load_obj(sys.argv[2])
    ws.TOTAL = float(sum(ws.UNIGRAMS.values()))
    ws.clean = identity

for line in sys.stdin:
    line = line.replace("\n", "")
    seg = " ".join(ws.segment(line))
    print(line + "\t" + " ".join(nltk.word_tokenize(seg)).replace(
        "``", "\"").replace("\'\'", "\"") + "\t N/A")
'''
@Licence : This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0
International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/.
'''
import CMUTweetTagger as cmu
import wordsegment as ws

file1 = open()
file2 = open()
data1 = file1.read()
data2 = file2.read()
tweets1 = data1.split("\n\n")
hashtags = []
for tweet1 in tweets1:
    hashtag = tweet1.split("\n")[0]
    hashtags.append(" ".join(ws.segment(hashtag)))
postags = cmu.runtagger_parse(hashtags)
i = 0
for postag in postags:
    if '$' in "".join(postag):
        i += 1
def load_dataset(trFile=None, teFile=None):
    labelsAsNums = {}
    numsAsLabels = {}
    labelNum = 0
    numTweets = 0
    testTweets = []
    x_train = []
    y_train = []
    x_test = []
    max_sequence_length = -1
    wordsegment.load()

    # load in train tweets and corresponding labels
    if trFile:
        with open(trFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].strip()
                # replace any non standard characters with standard
                # text = convertChars(text)
                # 'The symbols ’@’ and ’#’ were excluded from the list due to their specific semantics in tweets.' - vradivchev
                text = text.replace('#', '')
                text = text.replace('@', '')
                # 'All occurrences of tokens beginning with a hashtag were split into the separate words
                # comprising the token, provided that each separate word is uppercased' - vradivchev
                text = ' '.join(wordsegment.segment(text))
                # 'Afterwards the tweets were subjected to tokenization and lowercasing' - vradivchev
                text = " ".join(nltk.word_tokenize(text))
                text = text.lower()
                # 'Afterwards we proceeded with removing a variety of different stop words' - vradivchev
                stop_words = set(stopwords.words('english'))
                text = ' '.join([w for w in text.split() if not w in stop_words])
                x_train.append(text)
                if len(text.split()) > max_sequence_length:
                    max_sequence_length = len(text.split())
                if tweet[2] not in labelsAsNums:
                    labelsAsNums[tweet[2]] = labelNum
                    numsAsLabels[str(labelNum)] = tweet[2]
                    labelNum += 1
                y_train.append(labelsAsNums[tweet[2]])

    # load in test tweets and corresponding labels
    if teFile:
        with open(teFile, 'r') as csvfile:
            tweetreader = csv.reader(csvfile, delimiter='\t')
            for tweet in tweetreader:
                text = tweet[1].strip()
                text = convertChars(text)
                # 'The symbols ’@’ and ’#’ were excluded from the list due to their specific semantics in tweets.' - vradivchev
                text = text.replace('#', '')
                text = text.replace('@', '')
                # 'All occurrences of tokens beginning with a hashtag were split into the separate words
                # comprising the token, provided that each separate word is uppercased' - vradivchev
                text = ' '.join(wordsegment.segment(text))
                # 'Afterwards the tweets were subjected to tokenization and lowercasing' - vradivchev
                text = " ".join(nltk.word_tokenize(text))
                text = text.lower()
                # 'Afterwards we proceeded with removing a variety of different stop words' - vradivchev
                stop_words = set(stopwords.words('english'))
                text = ' '.join([w for w in text.split() if not w in stop_words])
                testTweets.append(tweet)
                x_test.append(text)

    return x_train, y_train, x_test, labelNum, testTweets, labelsAsNums, numsAsLabels, max_sequence_length
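# A hedged, condensed sketch of the vradivchev preprocessing order applied above
# (strip # and @, segment, tokenize, lowercase, remove stop words); the helper name
# is illustrative, and wordsegment.load() must have been called beforehand:
import nltk
import wordsegment
from nltk.corpus import stopwords

def preprocess_tweet(text):
    text = text.replace('#', '').replace('@', '')
    text = ' '.join(wordsegment.segment(text))
    text = ' '.join(nltk.word_tokenize(text)).lower()
    stop_words = set(stopwords.words('english'))
    return ' '.join(w for w in text.split() if w not in stop_words)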
temp = [float(embed) for embed in temp]
all_vector.append(temp)
vector_file.close()

new_word2vec300 = {}
for i in range(len(all_word_unique)):
    new_word2vec300[all_word_unique[i]] = all_vector[i]
pickle.dump(new_word2vec300, open('new_word2vec300.pickle', 'wb'))
new_word2vec300 = pickle.load(open('new_word2vec300.pickle', 'rb'))

# max_count = 0
tweet_embedding = []
for tweet in tweet_list:
    # temp = tweet.split(" ")
    temp = segment(tweet.replace('@USER', ''))
    sentence_embed = np.zeros(300)
    word_count = 0
    for word in temp:
        try:
            # sentence_embed += word2vec300[word]
            sentence_embed += new_word2vec300[word]
            word_count += 1
        except:
            pass
    # if word_count > max_count:
    #     max_count = word_count
    #     print(tweet)
    tweet_embedding.append(sentence_embed / word_count)
tweet_embedding_b = np.array(tweet_embedding)[~pd.isnull(subtask_blist)]
def test_segment_12():
    assert segment(
        'faroutintheunchartedbackwatersoftheunfashionableendofthewesternspiralarmofthegalaxyliesasmallunregardedyellowsun'
    ) == [
        'far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the',
        'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm', 'of',
        'the', 'galaxy', 'lies', 'a', 'small', 'un', 'regarded', 'yellow', 'sun'
    ]
def split(word):
    return jsonify(result=segment(word))
def test_segment_0():
    assert segment('choosespain') == ['choose', 'spain']
def test14(parsedTag, postag):
    nounpart = []
    k = 0
    ret = []
    splitline = parsedTag.split()
    for x in postag:
        if x == 'M' or x == '^' or x == 'Z':  # use == rather than `is` for string comparison
            nounpart.append(splitline[k])
        k += 1
    if " ".join(nounpart) == "":
        ret.append(2)
    while True:
        try:
            # gets all the urls for the hashtag on google search
            googledata = searchWeb.searchgoogle(parsedTag)
            break
        except:
            continue
    count = 0
    i = 1
    for site in googledata:
        try:
            # checks whether the noun parts of the hashtag are popular by counting the websites they appear on
            if searchWeb.searchforstring(site, nounpart):
                count += 1
        except:
            pass
        i += 1
        if i > 10:
            break
    if count > 5:
        ret.append(1)
    else:
        ret.append(0)
    seg = parsedTag.split()
    m = []
    for n in seg:
        m.append(stemming.porter2.stem(n))
    seg = " ".join(m)
    if socialListProxy:
        proxy = ulib.ProxyHandler({
            'http': socialListHttp_Proxy,
            'https': socialListHttps_Proxy
        })
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)
    counter = 0
    total = 0
    for site in googledata:
        req = ulib.Request(site, headers={'User-Agent': "Mozilla/5.0"})
        site = segment(site)
        l = []
        for j in site:
            l.append(stemming.porter2.stem(j))
        site = " ".join(l)
        try:
            content = ulib.urlopen(req)
            # searches for a match of the hashtag in the title and url of every page
            x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
            t = []
            for s in x:
                t.append(stemming.porter2.stem(s))
            t = " ".join(t)
            if (seg in site) or (seg in t):
                counter = counter + 1
            total = total + 1
        except:
            pass
        if total == 10:
            ret.append("%.4f" % (float(counter) / total))
        if total == 20:
            ret.append("%.4f" % (float(counter) / total))
            break
    if total < 10:
        ret.append("%.4f" % (float(counter) / 10.0))
        ret.append("%.4f" % (counter / 20.0))
    elif total < 20:
        ret.append("%.4f" % (float(counter) / 20.0))
    return ret
def test_segment_1():
    assert segment('thisisatest') == ['this', 'is', 'a', 'test']
def performSegmentation(word):
    flag = word.endswith('.')
    new_word = " ".join(segment(word))
    if flag:
        new_word += '.'
    return new_word
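# A quick usage sketch for performSegmentation, assuming segment comes from
# wordsegment and load() has already been called:
from wordsegment import load, segment
load()
print(performSegmentation("thisisatest."))
# expected: "this is a test."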
import wordsegment
from wordsegment import segment

segment('thisisatest')
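# Note: recent versions of wordsegment require the corpus data to be loaded before
# segmenting; a minimal sketch:
import wordsegment
wordsegment.load()
print(wordsegment.segment('thisisatest'))  # ['this', 'is', 'a', 'test']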