Example #1
 def __init__(self,grammar,tags):
     self.lines = []
     repeat_line_array = nltk.word_tokenize(grammar.gen_frame_line(grammar.cfg.start()))
     x = random.randint(0,8)
     y = random.randint(0,8)
     for i in range(8):
         if (i == x or i == y):
             spot_array = []
             j = 0
             noun_set = set(['he','she','it','I'])
             for wop in repeat_line_array:
                 if wop in set(tags):
                     spot = Spot(wop,i,j,'POS')
                     if (wop in noun_set):
                         spot.add_POS('NN')
                     spot_array.append(spot)
                 else:
                     spot = Spot(wop,i,j,'word')
                     spot_array.append(spot)
                 j += 1
             self.lines.append(spot_array)
         else:
             line_array = nltk.word_tokenize(grammar.gen_frame_line(grammar.cfg.start()))
             spot_array = []
             j = 0
             for wop in line_array:
                 if wop in set(tags):
                     spot = Spot(wop,i,j,'POS')
                     spot_array.append(spot)
                 else:
                     spot = Spot(wop,i,j,'word')
                     spot_array.append(spot)
                 j += 1
             self.lines.append(spot_array)
Example #2
def jaccard(a,b):
    A,B=nltk.word_tokenize(a),nltk.word_tokenize(b)
    A,B = set(A),set(B)
    intersect = len(A.intersection(B))
    union = len(A.union(B))
    coef = float(intersect)/union
    return coef
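A quick usage sketch for jaccard() (assumes nltk is imported and the punkt tokenizer models are installed; the sentences are made up for illustration):

import nltk
# nltk.download('punkt')  # uncomment on first run if the tokenizer models are missing

print(jaccard("the cat sat on the mat", "the cat sat on the hat"))
# token sets: {the, cat, sat, on, mat} vs {the, cat, sat, on, hat}
# intersection = 4, union = 6, so the coefficient is 4/6 ≈ 0.67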
Example #3
def build_s(data):
    '''
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dict s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    '''
    s = {}
    for lexelt, lexelt_info in data.items():
        words = set()
        for (instance_id, left_context, head, right_context, sense_id) in lexelt_info:
            left_tokens = nltk.word_tokenize(left_context)
            right_tokens = nltk.word_tokenize(right_context)
            words.update(k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size))

        s[lexelt] = list(words)

    return s
Example #4
    def load_file_without_frequency(self, positif, negatif):
        tab = []
        maxs = self.nbFeatures
        phrases = []
        y = []
        with codecs.open(positif, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()  # remove the \n
                phrases.append(line)
                y.append(1)
                for mot in word_tokenize(line):
                    tab.append(mot)
        with codecs.open(negatif, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()  # remove the \n
                phrases.append(line)
                y.append(0)
                for mot in word_tokenize(line):
                    tab.append(mot)
        word_fd = FreqDist(tab)
        print(word_fd)
        for i in range(len(phrases)):
            mots = word_tokenize(phrases[i])
            tmp = []
            for element in mots:
                tmp.append(word_fd[element])
            if len(tmp) < maxs:
                for j in range(maxs - len(tmp)):
                    tmp.append(0)
            elif len(tmp) > maxs:
                tmp = tmp[:maxs]
            phrases[i] = tmp
        return (np.array(phrases), np.array(list(set(tab))), np.array(y))
Example #5
def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        {
			[(instance_id, left_context, head, right_context, sense_id), ...]
        }
    :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...]
    :return: vectors: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }
            labels: A dictionary with the following structure
            { instance_id : sense_id }

    '''

    vectors = {}
    labels = {}
    for (instance_id, left_context, head, right_context, sense_id) in data:
        labels[instance_id] = sense_id
        left_tokens = nltk.word_tokenize(left_context)
        right_tokens = nltk.word_tokenize(right_context)
        words = k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size)
        vectors[instance_id] = frequency_vector_from_near_words(s, words)

    return vectors, labels
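Examples #3 and #5 call k_nearest_words_vector_from_tokens and frequency_vector_from_near_words, plus a window_size constant, none of which are shown here. A minimal sketch of what such helpers could look like (these are assumptions, not the original implementations):

# Hypothetical reconstructions of the missing helpers; the window_size value is assumed.
window_size = 10

def k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size):
    # keep the window_size tokens closest to the head word on each side
    return left_tokens[-window_size:] + right_tokens[:window_size]

def frequency_vector_from_near_words(s, words):
    # count how often each feature word in s occurs among the nearby words
    return [words.count(w) for w in s]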
Example #6
def tester():
    tweetList = readPickleFile(pickledTweets)
    resultList = readTextFile(cleanTweets)

    count1 = 0
    count2 = 0
    precision = 0
    
    for tweet, result in zip(tweetList, resultList):
       
        count1 = count1 +1
        print(tweet)
        extract = extraction.runner(tweet)
        
        extractTokens = nltk.word_tokenize(extract)
        resultTokens = nltk.word_tokenize(result)
        
        precisionList = [word for word in resultTokens if word in extractTokens]
        if len(precisionList) == len(resultTokens):
            precision = precision + 1
        
        print('Exrt: ' + extract)
        print('Cort: ' + result)

        
        if (extract.strip() == result.strip()):
            count2 = count2 +1
            
    print('Precision: ' + str(precision) + ' out of ' + str(count1) + ' or ' + str(float("{0:.2f}".format(precision/float(count1)))) + '% of retrieved instances are relevant (positive predictive value)')
    print('Recall: ' + str(count2) +' out of ' + str(precision) + ' or ' + str(float("{0:.2f}".format(count2/float(precision)))) + '% of the relevant instances are retrieved (sensitivity)')
Example #7
 def __init__(self, title, full_text, sentence):
     self.title = title
     self.sentence = sentence
     # map of word -> number of times it appears in the full article text
     self.full_text_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(full_text))
     # map of word -> number of times it appears in the given sentence
     self.sentence_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(sentence))
Example #8
def tfidfifyAwards():
    """This returns a dictionary of words that are used in awards along with a score that reflects how useful they are in identifying award names"""
    WordFreq = Counter()
    for award in OFFICIAL_AWARDS:
        award = cast_to_syn(award)
        aTokens = nltk.word_tokenize(award)
        for word in aTokens:
            WordFreq[word] += 1

    DocFreq = Counter()
    for award in OFFICIAL_AWARDS:
        award = cast_to_syn(award)
        aTokens = nltk.word_tokenize(award)
        for word in WordFreq:
            if word in aTokens:
                DocFreq[word] += 1
    retDict = dict((el, 0.0) for el in DocFreq)

    for word in DocFreq:
        retDict[word] = 1.0 / (DocFreq[word] ** (3.0 / 4.0))
    """for word in media_words:
		retDict[word] = retDict[word]*1.5
	for word in genre_words:
		retDict[word] = retDict[word]*1.5
	"""
    return retDict
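For intuition, the 1 / DocFreq ** (3/4) weight above shrinks quickly as a word appears in more award names; a small check of the arithmetic (the document frequencies below are hypothetical):

for df in (1, 2, 5, 16):
    print(df, round(1.0 / (df ** 0.75), 3))
# 1 -> 1.0, 2 -> 0.595, 5 -> 0.299, 16 -> 0.125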
Example #9
    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Example #10
def POS_Ngram(N, example_set, i):
    N_grams = dict()
    count = 0
    for para in example_set:
        if i == 0: # get first sentence
            tokens = word_tokenize(para.first)
        else: # get ith sentence
            para.order_sentence()
            tokens = word_tokenize(para.ordered_sentences[i-1])
            #tokens = word_tokenize(para.scrambled_sentences[int(para.correct_order[i-1])-1])
        tagset = None
        #print(tokens)
        tokens = _pos_tag(tokens, tagset, tagger)

        tags = [x[1] for x in tokens] # take POS tags only

        n_tags = list(ngrams(tags, N))

        for tag_set in n_tags:
            count += 1
            if tag_set in N_grams:
                N_grams[tag_set] += 1
            else:
                N_grams[tag_set] = 1 # first occurrence of tagset
    # Normalize N_gram counts by total number of N grams for this set of sentences
    for ngram, num in N_grams.items():
        N_grams[ngram] = num/count
    return N_grams
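POS_Ngram relies on nltk's ngrams helper, a tagger, and the example's own paragraph objects. A self-contained sketch of the core idea, counting POS bigrams for one sentence (requires the punkt and averaged_perceptron_tagger resources; exact tags depend on the installed model):

import nltk
from nltk.util import ngrams

tokens = nltk.word_tokenize("The dog barked loudly")
tags = [tag for _, tag in nltk.pos_tag(tokens)]
counts = {}
for bigram in ngrams(tags, 2):
    counts[bigram] = counts.get(bigram, 0) + 1
print(counts)  # e.g. {('DT', 'NN'): 1, ('NN', 'VBD'): 1, ('VBD', 'RB'): 1}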
Example #11
def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    # going forward
    forward = context[:pos]
    f = forward[(-windowSize) // 2:]  # floor division keeps the slice index an int on Python 3
    # going backward
    backward = context[pos + 1:]
    b = backward[:windowSize // 2]
    for item in f:
        key = "pre" + str(len(f) - f.index(item)) + "-word"
        value = item
        dictionary[key] = value
        key = "pre" + str(len(f) - f.index(item)) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    for item in b:
        key = "fol" + str(b.index(item) + 1) + "-word"
        value = item
        dictionary[key] = value
        key = "fol" + str(b.index(item) + 1) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    return dictionary
Example #12
def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1=""
    extent1=""
    trigger2=""
    extent2=""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1=one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2=one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1=one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2=one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1=one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2=one_pair[1]
            break
    return (pos1, pos2)
Example #13
    def update(self, other):
        """Adds counts for elements in other"""
        if isinstance(other, self.__class__):
            self.n_sents += other.n_sents
            for x, n in other.items():
                self[x] += n
        else:
            for sent in other:
                self.n_sents += 1

                # import pdb;pdb.set_trace()
                if self.poscache is not None:
                    if sent in self.poscache:
                        tags = self.poscache[sent]
                    else:
                        self.poscache[sent] = tags = nltk.pos_tag(
                            nltk.word_tokenize(sent))
                else:
                    tags = nltk.pos_tag(nltk.word_tokenize(sent))

                for x in tags:
                    tok, tag = x
                    self[tag] += 1

            if self.normalize:
                for x, n in self.items():
                    self[x] /= float(self.n_sents)
Example #14
def read_liveqa(prefix = '../data/qalab-liveqa/dataset/qrels/', train = 'LiveQA2015-ver2.qrels', tokenize = True):
	import nltk

	f = open_file(prefix + train)
	np.random.seed(0)

	data_split = {0: [], 1 : [], 2 : []}
	ref_split = {0: [], 1 : [], 2 : []}

	for i,line in enumerate(f):
		l = line.strip().split('\t')
		if l[2] == '':
			first = " ? ".join(l[3].strip().split("?"))
			second = " . ".join(first.strip().split("."))
			q = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ')
			split_id = np.random.choice([0,0,0,1,2])
			continue
		label = int(l[2]) >= 3

		first = " ? ".join(l[3].strip().split("?"))
		second = " . ".join(first.strip().split("."))
		a = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ')
		data_split[split_id] += [(q,a,label,'','')]
		ref_split[split_id] += [(l[0],'0',l[0]+'_'+l[1]+'_'+str(i),str(int(label)))]

	return data_split[0],data_split[1],data_split[2],(ref_split[0],ref_split[1],ref_split[2])
Example #15
def reading_level(full_text):
    #Clean the full_text
    full_text_clean = ""
    for char in full_text:
        if char == ".":
            full_text_clean += ". "
        else:
            full_text_clean += char

    #Language features
    import nltk
    words = nltk.word_tokenize(full_text_clean)

    n_sents = len(nltk.sent_tokenize(full_text_clean))
    n_words = len(nltk.word_tokenize(full_text_clean))

    #Count the syllables
    n_syll = 0
    for word in words:
        n_syll += syllable_count(word)

    #Calculate the reading level
    #https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

    grade_level = -15.59 + 0.39*(n_words/n_sents) + 11.8*(n_syll/n_words)
    return round(grade_level,1)
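reading_level depends on a syllable_count helper that is not shown. A rough stand-in plus a worked check of the Flesch-Kincaid grade formula used above (the heuristic below is an assumption, not the original helper):

import re

def syllable_count(word):
    # crude estimate: count groups of consecutive vowels
    return max(1, len(re.findall(r'[aeiouy]+', word.lower())))

# Worked example of the formula: 100 words, 5 sentences, 140 syllables
# grade = -15.59 + 0.39 * (100 / 5) + 11.8 * (140 / 100)
#       = -15.59 + 7.8 + 16.52 = 8.73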
Example #16
def tag_chapter(chapter):
	text_raw = open("ofk_ch" + str(chapter) + ".txt").read()
	bad = ["ca", 'wo', 'thelmselves']
	tokens = nltk.word_tokenize(text_raw)
	# tokens = [w.lower() for w in tokens] #change to lower case
	tokens = [re.sub('\.','',w) for w in tokens] #remove periods
	tokens = [w for w in tokens if w.isalpha()] #just keep words
	tokens = [w for w in tokens if not w in stopwords.words('english')]
	tokens = [w for w in tokens if len(w) > 1]
	tokens_freq = FreqDist(tokens)
	tokens_10 = [w for w in tokens if tokens_freq[w] > 20]
	tokens_10 = [w for w in tokens_10 if w not in bad]
	tokens_freq = FreqDist(tokens_10)

	tokens_table = [(w, importance(w, tokens_freq, tokens_10, text1, text2, text3, text4, text5, text6, text7, text8, text9)) for w in tokens_freq]

	a = lambda e1, e2: int(1000000*(e1[1] - e2[1]))

	sorted_table = sorted(tokens_table, cmp = a, reverse=True)
	# The number of elements you want to dump
	nums = 20
	final_table = sorted_table[:nums];
	pos_tuples = []
	for x in final_table:
		token = nltk.word_tokenize(x[0])
		pos_tuples = pos_tuples + nltk.pos_tag(token)
	list_pos = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
	pos_names = ['Conjunction', 'Numerical', 'Determiner', 'Existential There', 'Foreign Word', 'Preposition or Conjunction, Subordinating', 'Adjective or Numeral', 'adjective, comparative', 'adjective, superlative', 'List Item Marker', 'Modal Auxiliary', ' Noun, Common', 'Noun, Proper, Singular', 'Noun, Proper, Plural', 'Noun, Common, Plural', 'Pre-Determiner', 'Genitive Marker', 'Pronoun, Personal', 'Pronoun, Possessive', 'Adverb', 'Adverb, Comparitive', 'Adverb, Superlative', 'Particle', 'Symbol', '"to" as preposition', 'Interjection', 'Verb, Base Form', 'Verb, Past Tense', 'Verb, Present Participle', 'Verb, Past Participle', 'Verb, Present Tense, Not 3rd Person Singular', 'Verb, Present Tense, 3rd Person Singular', 'WH-Determiner', 'WH-Pronoun', 'WH-Pronoun, Possessive', 'WH-Adverb']
	result = [None] * len(list_pos)
	for p in range(len(list_pos)):
		result[p] = [dict(name=w[0]) for w in pos_tuples if w[1] == list_pos[p]]
	result_dict = [dict(name=pos_names[n], children=result[n]) for n in range(len(list_pos)) if result[n]]
 	return result_dict
Example #17
def parseFile(file):
	""" Parse the header and source files for the class, and return the bindings dictionary, which contains tag data (and other pertinent 
		information about the file)
	"""
	#print file
	
	bindings 	= []
	
	
	# Load header file
	tokens 		= []
	if (file['header'] != ''):
		with open(file['header'], 'r') as f:
			# Tokenize
			for line in f.readlines():
				tokens += nltk.word_tokenize(line)
	
	# Parse tokens
	bindings += parseTokens( tokens, file, 'header' )

	
	# Load source file
	tokens 		= []
	if (file['source'] != ''):
		with open(file['source'], 'r') as f:
			# Tokenize
			for line in f.readlines():
				tokens += nltk.word_tokenize(line)
	
	# Parse tokens
	bindings += parseTokens( tokens, file, 'source' )	
	
	return bindings
Example #18
def tokenize(instances, lowercase=False):
    if lowercase:
        tokens = [nltk.word_tokenize(i.lower()) for i in instances]
    else:
        tokens = [nltk.word_tokenize(i) for i in instances]

    return tokens
Example #19
def get_tasty(local_id):
	local = fs.local_info(local_id)
	reviews = local[3]
	menu = local[4]
	r_scores = score_reviews(reviews)
	m_items = fs.process_menu(menu)
	toreturn = []
	r_reviews = []
	for review in r_scores:
		(text, s) = review
		text_w = eliminate_stop_words(nltk.word_tokenize(text))
		item_scores = []
		for item in m_items:
			try:
				(name,desc,price) = item
			except ValueError:
				return ([(" $ "," ")],[(" "," $ ")])
			text_m = eliminate_stop_words(nltk.word_tokenize(name)+nltk.word_tokenize(desc))
			score = similarity(text_w, text_m)
			item_scores.extend( [( name+' $'+str(price) ,score)] )
		item_refered = max(item_scores, key = operator.itemgetter(1))
		if item_refered[1]>1 and s>0:
			toreturn.append( item_refered[0] )
			r_reviews.append( (text, item_refered) )
	counter = dict([(item, 0) for item in toreturn])
	for item in toreturn:
		counter[item] += 1
	counted_items = [ (item, counter[item]) for item in toreturn]
	print counted_items
	return (counted_items ,r_reviews )
Example #20
def checkTypeWordCount(answer,question):
    count = 0
    status = ''
    sum = 0
    status1 = 'false'

    # treat these characters as punctuation rather than countable words
    punctuation = set(".,'\":;?/\\|][}{()*&^%$#@!`~-_=+")
    for word1 in word_tokenize(answer):
        if word1 in punctuation:
            print 'error'
        else:
            sum = sum + 1
            #print word1
    print sum

    words_ans = word_tokenize(answer)
    words_qus = word_tokenize(question)
    if words_ans[0] == "NOTICE" or words_ans[0] == "Notice":
        print "Correct"
        count = count + 0.25
    else:
        status = "Wrong"

    for word in words_qus:
        if en.is_number(word) and words_qus[words_qus.index(word) + 1] == 'words':
            if sum >= float(word):  # compare against the numeric word limit, not the raw string
                print word
                count = count + 0.25
            status1 = 'true'

    if status1 == 'false':
        count = count + 0.25
    return count, status
Example #21
	def test(self):
	
		test_vector = []
		test_emb = []
		cList = []
		
		reviews = ET.parse(self.test_path).getroot().findall('Review')
		for review in reviews:
			sentences = review[0] #get the sentences
			for sentence in sentences:
				if (len(sentence) > 1):
					opinions = sentence[1]
					
					if ( len(opinions) > 0): #check if there are aspects 
						t = sentence[0].text
						
						text = word_tokenize(t.lower())
						textC = word_tokenize(t) #tokenize, check for caps
						
						for opinion in opinions:
							
							test_emb.append(text) #store the tokenized words for the embedding's calculation
		
		centroid = self.calcCentroid(self.model,test_emb) #caclulate the centroid for each sentence	
		
		for i in range(len(centroid)): #join the matrices
			tmp = centroid[i].tolist()
			test_vector.append(tmp)
		
		print
		print '---- End of Test ----'

		return test_vector
Example #22
def nltk_filter(sent):
  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1            = b1.lower()
  tokens        = word_tokenize(b1)
  pos_tags      = pos_tag(tokens)
  filtered_sent = ' '
  for token in tokens:
    filtered_sent += '1'+token + ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

#note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2            = b2.lower()
  tokens        = word_tokenize(b2)
  pos_tags      = pos_tag(tokens)
  # filtered_sent = ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  for token in tokens:
    filtered_sent += '2' + token + ' '

  return filtered_sent
Example #23
def main(question, article):
  ddict = {}
  counts = get_counts()
  for tok in nltk.word_tokenize(article):
    ddict[tok] = ddict.get(tok, 0) + 1

  vec = []
  for tok in nltk.word_tokenize(question):

    # count in article
    tf = ddict.get(tok, 0) 

    # total articles is 108 / number that have current token
    idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
    vec.append(tf*idf)

  largest = max(vec)
  normalized = map(lambda y: y/largest, vec)

  finDic = {}
  for i, word in enumerate(nltk.word_tokenize(question)):
    finDic[word] = normalized[i]

  print finDic
  return finDic
Example #24
def PushDataPair(data, database):
        last = len(database['Q'].keys())
        for pair in data:
                database['Q'][last] = nltk.word_tokenize(pair['question'])
                database['A'][last] = nltk.word_tokenize(pair['answer'])
                last += 1
        return database
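A small usage sketch for PushDataPair (the Q/A pair is made up; nltk.word_tokenize needs the punkt models):

database = {'Q': {}, 'A': {}}
data = [{'question': 'What is NLTK?', 'answer': 'A Python toolkit for natural language processing.'}]
database = PushDataPair(data, database)
print(database['Q'][0])  # ['What', 'is', 'NLTK', '?']
print(database['A'][0])  # ['A', 'Python', 'toolkit', 'for', 'natural', 'language', 'processing', '.']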
Example #25
def build_s(data):
    '''
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dict s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    '''
    s = {}

    # implement your code here
    for key, value in data.items():
        for i in value:
            tokens_left = nltk.word_tokenize(i[1])
            tokens_right = nltk.word_tokenize(i[3])
            left = [w for w in tokens_left if w not in string.punctuation][-window_size:]
            right = [w for w in tokens_right if w not in string.punctuation][:window_size]
            context = left + right
            if key not in s:
                s[key] = []
            for word in context:
                if word not in s[key]:
                    s[key].append(word)

    return s
Example #26
def synsym(s1,s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adj  
    jj0 = [x for x,y in ts0 if y=='JJ' or y=='JJR' or y=='JJS']
    jj1 = [x for x,y in ts1 if y=='JJ' or y=='JJR' or y=='JJS']
    if len(jj0) == 0 or len(jj1) ==0:
      jjps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      jjps = np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # noun
    jj0 = [x for x,y in ts0 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    jj1 = [x for x,y in ts1 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    if len(jj0) == 0 or len(jj1) ==0:
      nps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      nps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # verb
    jj0 = [x for x,y in ts0 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    jj1 = [x for x,y in ts1 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    if len(jj0) == 0 or len(jj1) ==0:
      vps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      vps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))    
    return [jjps,nps,vps]
Example #27
def paragraph_features(paragraph_sents):
    global count
    count += 1
    print '\r', count,

    if FEATURE == FEAT_CONTAINS:
        paragraph_words = set(
            sents_to_words(paragraph_sents)
        )
    elif FEATURE == FEAT_LINKED_TITLES:
        paragraph_words = ' '.join(paragraph_sents)
    elif FEATURE == FEAT_FIRST_SENT:
        paragraph_words = nltk.word_tokenize(
            paragraph_sents[0]
        )
    elif FEATURE == FEAT_BEGIN_SENT:
        paragraph_words = {
            nltk.word_tokenize(sent)[0]
            for sent in paragraph_sents
        }
    else:
        paragraph_words = None
        print 'FEATURE NOT SUPPORTED'
        exit()

    features = dict()
    for word in word_features:
        features[word_features[word]] = (
            word in paragraph_words
        )

    return features
Example #28
def next_note(tokenizer):
    print 'SemEval data'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        with open(semeval_file, 'r') as f:
            st = []
            for line in f:
                st += [line.strip()]
            text = read_visit_sem(st)
            text = tokenizer.tokenize(text)
            for sent in text:
                yield nltk.word_tokenize(sent.lower())
    print 'MIMIC data'
    for notes_file in subset(notes_files, 15): # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass
Example #29
File: A.py Project: keyu-lai/NLP
def build_s(data):
    """
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dict s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    """
    s = {}

    # implement your code here

    for lexelt in data:
        words = set()
        for instance in data[lexelt]:

            left_context = word_tokenize(instance[1].strip())
            for token in left_context[-window_size:]:
                if token not in puncts:
                    words.add(token)

            right_context = word_tokenize(instance[3].strip())
            for token in right_context[:window_size]:
                if token not in puncts:
                    words.add(token)
        s[lexelt] = list(words)

    return s
Example #30
def stanford_corenlp_filter(sent):
  from nltk.tag.stanford import POSTagger
  posTagger = POSTagger('/Users/gt/Downloads/'
                        'stanford-postagger-2013-06-20/models/'
                        'wsj-0-18-bidirectional-nodistsim.tagger',
                        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                        '/stanford-postagger-3.2.0.jar',encoding=encoding)

  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1 = b1.lower()
  tokens = word_tokenize(b1)
  pos_tags = posTagger.tag(tokens)
  filtered_sent = ' '
  for pos_t in pos_tags:
    if pos_t[1] in filterList:
      # filtered_sent += stemmer.stem(pos_t[0]) + ' '
      filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

      #note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2 = b2.lower()
  tokens = word_tokenize(b2)
  pos_tags = posTagger.tag(tokens)
  # filtered_sent = ' '  # keep accumulating; resetting here would discard the '1'-prefixed words from the first block
  for pos_t in pos_tags:
    if pos_t[1] in filterList:
      # filtered_sent += stemmer.stem(pos_t[0]) + ' '
      filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  return filtered_sent
Example #31

def getCategories():
    return categories


# A list of tuples with the words of the sentences and the category name
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # Remove punctuation
        each_sentence = remove_punctuation(each_sentence)
        print(each_sentence)
        # Extract the words from each sentence and store them in the list
        w = nltk.word_tokenize(each_sentence)
        print("\nPalavras tokenizadas: ", w)
        words.extend(w)
        docs.append((w, each_category))

# Stem each word, convert to lowercase, and remove duplicates
words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))

# Create the lists for the training data
training = []
output = []

# Create an array for the output
output_empty = [0] * len(categories)
testMusicians = list()
for genre in testLists:
    musicians = get_musicians(genre)
    for musician in musicians:
        testMusicians.append(musician) 

all_text = ''
for entry in trainingMusicians:
    text = get_page_text(entry[1])
    entry[1] = text
    all_text += text
striptext = all_text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
documents = [entry[1] for entry in trainingMusicians]
sentences = sent_tokenize(striptext)
words = word_tokenize(striptext)
texts = [[word for word in document.lower().split()
        if word not in STOPWORDS and word.isalnum()]
        for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary)

for entry in testMusicians:
    text = get_page_text(entry[1])
    vec_bow = dictionary.doc2bow(text.lower().split())
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    entry[1] = trainingMusicians[sims[0][0]][0]
Example #33
    def SentenceAnalysis(self, fulltext, textfdist):
        #debug_print("Answer.SentenceAnalysis(_,_)", level=5)
        ans_sentencelist = []
        # Perform text normalization, while preserving offsets
        text = fulltext.replace('\n', ' ')
        self.ans_text = text
        # Separate text into sentences
        # TODO: See if NLTK sentence tokenizer works better
        ## OLD: p = re.compile(r'.+\.')
        p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]')
        ## OLD: keysen = p.findall(text)
        # # Commented out the logic below because the regex is not working properly.
        # # offset = 0
        # # keysen = []
        # # starts = []
        # # ends = []
        # # while (len(text) > 0):
            # # match = p.search(text)
            # # if not match:
                # # break
            # # keysen.append(match.group(0))
            # # starts.append(offset + match.start(0))
            # # ends.append(offset + match.end(0))
            # # text = text[match.end(0) : ]
            # # offset += match.end(0)
        keysen = sent_tokenize(text)
        remove_point_patt = re.compile(r"^\d.")
        keysen = [x for x in keysen if not remove_point_patt.search(x.strip())]
        # Create hash entries for each sentence
        sen_no = 0
        for sen in keysen:
            #debug_print("sen: " + str(sen), level=6)
            sen_no += 1
            # Tokenize text, part-of-speech tag, derive WordNet base word (lemma), and then add information for words found.
            # Note: An optional part-of-speech tag prefix can be included.
            # TODO: Isolate text preprocessing code in a separate function
            sen_text = remove_latex(sen)
            text = nltk.word_tokenize(sen_text)
            part_of_speech_tagged_words =  nltk.pos_tag(text)
            text_words = list(nltk.corpus.wordnet.morphy(word.lower()) for (word, tag) in part_of_speech_tagged_words)
            text_words_proper = list(word for word in text_words if word)
            if self.use_part_of_speech:
                # Prefix each word with wordnet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
                text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                                     for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words)
                                     if word]
            ans_sentencelist.append({'StuS': sen,
                                     'StuWords': text_words_proper,
                                     'No': sen_no})

        # Compute TF/IDF-style weighting scheme
        for sentence in ans_sentencelist:
            #debug_print("sentence: " + str(sentence), level=6)
            fdist = nltk.FreqDist(sentence['StuWords'])
            try:
                max_freq = max([f for f in fdist.values()])
            except ValueError:
               #print_stderr("Exception in Answer.SentenceAnalysis: " + str(sys.exc_info()))
                max_freq = 1
            log_max_freq = math.log(max_freq) if (max_freq > 1) else 1
            senvec = {}
            for word in sorted(textfdist):
                if fdist[word]:
                    wordfreq = sum(1 for senten in ans_sentencelist if word in senten['StuWords'])
                    if (self.use_true_tf_idf):
                        tf = 1 + math.log(fdist[word]) / log_max_freq
                        idf = 1 + math.log(len(keysen) / wordfreq)
                        senvec[word] = tf * idf
                    else:
                        senvec[word] = (1 + math.log(2.0 * fdist[word])) * math.log(2.0 * len(keysen) / wordfreq)
                else:
                    senvec[word] = 0
            sentence['StuSVec'] = senvec
        ##debug_print("Answer.SentenceAnalysis(%s,_) => %s" % (str(fulltext), str(ans_sentencelist)), level=6)
        ##debug_print("\t_ [textfdist]: %s" % str(textfdist), level=7)
        return ans_sentencelist
Example #34
try:
    with open('model/data.pickle', 'rb') as f:
        words, labels, training, output = pickle.load(f)

# Creating an empty list to store some values.
except:
    words = []
    labels = []
    docs_x = []
    docs_y = []

    # Creating a loop that would stem the words in the json dataset,
    # and append them into the list created above.
    for intent in data['intents']:
        for pattern in intent['patterns']:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent['tag'])

        # Creating an if statement to append the word that are not present in,
        # the labels list into the label list.
        if intent['tag'] not in labels:
            labels.append(intent['tag'])

    # Stemming the words and converting them into lowercase alphabets,
    # then setting an if statement to remove the ? character.
    words = [stemmer.stem(w.lower()) for w in words if w != "?"]
    words = sorted(list(set(words)))

    # Sorting the value for the words in labels and saving them into a
def RingNormalize(arrow):
    return RingPrecious(
        nltk.word_tokenize(arrow.lower().translate(remove_dot_ring)))
nltk.download()

# In[ ]:

import nltk
import numpy as np
import random
import string  #To process standard python strings

# In[ ]:

f = open('/Users/Henry/lordofthering.txt', 'r', errors='ignore')
raw = f.read()
raw = raw.lower()  # converts to lowercase
sent_precious = nltk.sent_tokenize(raw)  # converts to list of sentences
word_precious = nltk.word_tokenize(raw)  # converts to list of words

# In[ ]:

ring = nltk.stem.WordNetLemmatizer()


#WordNet is a semantically-oriented dictionary of English included in NLTK.
def RingPrecious(swords):
    return [ring.lemmatize(sword) for sword in swords]


remove_dot_ring = dict((ord(dot), None) for dot in string.punctuation)


def RingNormalize(arrow):
nltk.download()
paragraph = """I have three visions for India. In 3000 years of our history, people from all over 
               the world have come and invaded us, captured our lands, conquered our minds. 
               From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,
               the French, the Dutch, all of them came and looted us, took over what was ours. 
               Yet we have not done this to any other nation. We have not conquered anyone. 
               We have not grabbed their land, their culture, 
               their history and tried to enforce our way of life on them. 
               Why? Because we respect the freedom of others.That is why my 
               first vision is that of freedom. I believe that India got its first vision of 
               this in 1857, when we started the War of Independence. It is this freedom that
               we must protect and nurture and build on. If we are not free, no one will respect us.
               My second vision for India’s development. For fifty years we have been a developing nation.
               It is time we see ourselves as a developed nation. We are among the top 5 nations of the world
               in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.
               Our achievements are being globally recognised today. Yet we lack the self-confidence to
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""

# Tokenizing sentences
sentences = nltk.sent_tokenize(paragraph)

# Tokenizing words
words = nltk.word_tokenize(paragraph)
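A quick way to inspect the two tokenizations above (counts depend on the installed punkt models; shown only as a sanity check):

print(len(sentences))      # number of sentences found by sent_tokenize
print(len(words))          # number of word/punctuation tokens from word_tokenize
print(sentences[0][:50])   # first 50 characters of the first sentence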
Example #38
#print ("%s sentences of training data" % len(training_data))

# capture unique stemmed words in the training corpus
corpus_words = {}
class_words = {}
# turn a list into a set (of unique items) and then a list again (this removes duplicates)
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    # prepare a list of words within each class
    class_words[c] = []

# loop through each sentence in our training data
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore some things
        if word not in ["?", "'s"]:
            # stem and lowercase each word
            stemmed_word = stemmer.stem(word.lower())
            # have we not seen this word already?
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1
            # add the word to our words in class list
            class_words[data['class']].extend([stemmed_word])


# we now have each stemmed word and the number of occurrences of the word in our training corpus (the word's commonality)
#print ("Corpus words and counts: %s \n" % corpus_words)
Example #39
# Read the data and append SENTENCE_START and SENTENCE_END tokens
import os
os.chdir(r"C:\Users\s6324900\Desktop\Deep learning\RNN")
print "Reading CSV file..."
with open('reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    reader.next()
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print "Parsed %d sentences." % (len(sentences))
    
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print "Found %d unique words tokens." % len(word_freq.items())

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
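The final comment above announces a step whose code is cut off; a plausible sketch of that replacement step, using word_to_index and unknown_token from the lines above (an assumption based on the comment, not the original code):

for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]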
Example #40
# TODO - print video title
video_title = doc.find("div", attrs={"class":"cnnVidFooter"}).get_text()
print("Video title: ", video_title)

# TODO - print article content
container = doc.find("div", id="storytext")
content_list = [p.string for p in container.findAll("p") if p.string]
content = "\n".join(content_list)

print("Content: ", content)


# 1-2. Tokenize news article content by words
# TODO - tokenize article content
tokenized_words = nltk.word_tokenize(content)

print(tokenized_words)


# 1-3. POS-Tag tokenized words and sort POS by frequency
# TODO - POS_Tag tokenized words
tagged_list = nltk.pos_tag(tokenized_words)
print(tagged_list)

# TODO - sort POS by frequency
from collections import Counter
counter = Counter([el[1] for el in tagged_list])
print(counter)
# for tag in tagged_list:
#     pass
# Train the unigram tagger
uni_tag = ut(cess_sents)

X,y = ext_ft(cess_sents)

text = ""
f = open('PLIEGO_EJEMPLO.docx', 'rb')
document = Document(f)
for i in document.paragraphs:
    text +=i.text
f.close()
print("=========== PRIMEROS 45 ========")
print(text[:45])

# get the tokens
tokens = word_tokenize(text)
words = tokens[:250]
print("=========== TOKENS 50  ========")
print(words)

"""archivo_salida = open("my_tokens.txt","w")
for word in tokens:
    print(word,file=archivo_salida)
"""
text = nltk.Text(tokens)
print(type(tokens))
print(len(tokens))
print(tokens[:10])

text.collocations()
 def predict_sentence(self, sentence):
   sample =  [[ tag for _,tag in self.tagger.tag(word_tokenize(sentence)) ]]
   probs = [ model.test(sample) for model in self.hmm_models ]
   return probs.index(max(probs))
Example #43
def normalize(text):
    '''Return normalized and lemmatized tokens in text.'''
    text = removePunct(text)
    tokens = nltk.word_tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
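A usage sketch for normalize(); removePunct is not shown in the example, so a stand-in is defined here, and the WordNet data must be available for the lemmatizer (both are assumptions about the missing pieces):

import string

def removePunct(text):
    # hypothetical stand-in for the missing helper
    return text.translate(str.maketrans('', '', string.punctuation))

print(normalize("The cats were running quickly."))
# e.g. ['The', 'cat', 'were', 'running', 'quickly'] -- lemmatize() defaults to noun POS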
Example #44
import urllib2
import os,sys
import nltk
import bs4
from bs4 import BeautifulSoup
from nltk import word_tokenize, pos_tag

#Storing the link in a variable
hackernews="https://news.ycombinator.com/"
page=urllib2.urlopen(hackernews)
soup=BeautifulSoup(page, "html.parser")
page_content=soup.prettify()
#Storing the file in a variable
sent = open("filename.txt").read()
#Tokenizing the words stored in the file
words=nltk.word_tokenize(sent)
#Tagging the words of the files based on the parts of speech
tags=nltk.pos_tag(words)
#Taking input from the user
word_key=raw_input("Enter the any keyword to be searched : ")
#Searching the nouns(keywords) from the file
nouns=[word for word,pos in tags \
	if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
#Checking whether the keyword entered by the user is present in the file
with open("filename.txt") as openfile:
    for line in openfile:
        for part in line.split():
            if word_key in part:
                all_links = soup.find_all("a")
                for link in all_links:
                    print link.get("href")
Example #45
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words
        line = ast.literal_eval(line)
        #  print line
        #  exit(1)
        json_data = json.loads(json.dumps(line))
        dict[i] = json_data
        i += 1

l = []
wd = []
for i in dict:
    for j in dict[i]:
        if j == 'reviewText':
            l.append(str(dict[i][j]))

for i in l:
    text = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(text)
    nouns = [word for word, pos in tagged if (pos == 'NN')
             ]  # or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    wd.append(review_word(' '.join(nouns)))

# PROGRESS : List of list of nouns in a review

dictionary = corpora.Dictionary(wd)
corpus = [dictionary.doc2bow(text) for text in wd]

features = []

if __name__ == "__main__":
    ldamodel = models.LdaModel(corpus,
                               id2word=dictionary,
Example #47
def tag_sent(raw_file,new_file,labelfile,max_token=50):
    train_json_file = open(new_file, 'w')
    file = open(raw_file, 'r')
    sentences_0 = file.readlines()
    c = 0
    Tkk = []
    ii = 0
    Tkk = {}
    vV = []
    Mlabel = {}
    Alabel = {}
    count_r = {}
    label = []
    f = open(labelfile, 'r')
    fr = f.readlines()
    label = [line.strip('\n') for line in fr]
    f.close()
    for line in sentences_0:
        c += 1
        kk = 1
        count_r[c - 1] = 0
        Tkk[c - 1] = 0
        if not c % 10000:
            print(c)
        sent = json.loads(line.strip('\r\n'))
        flag = 0
        sentText = str(unicodedata.normalize('NFKD', sent['sentText']).encode('ascii', 'ignore')).rstrip('\n').rstrip('\r')
        #sentText=sent['sentText'] python3
        tokens = nltk.word_tokenize(sentText)
        tags=["O"]*len(tokens)
        emIndexByText = {}
        for em in sent['entityMentions']:
            emText = unicodedata.normalize('NFKD', em['text']).encode('ascii', 'ignore')
            # emText=em['text']
            tokens1 = tokens
            em1 = emText.split()
            flagE = True
            if emIndexByText.__contains__(emText):
                flagE = False
            while flagE:
                start, end = find_index(tokens1, em1)
                if start != -1 and end != -1:
                    tokens1 = tokens1[end:]
                    if emText not in emIndexByText:
                        emIndexByText[emText] = [(start, end)]
                    elif not emIndexByText[emText].__contains__((start, end)):
                        offset = emIndexByText[emText][-1][1]
                        emIndexByText[emText].append((start + offset, end + offset))
                else:
                    break
        for rm in sent['relationMentions']:
            if not rm['label'].__eq__('None') and label.__contains__(rm['label']):
                rmlabel = rm["label"]
                if not Alabel.__contains__(rmlabel):
                    Alabel[rmlabel] = [c - 1]
                else:
                    Alabel[rmlabel].append(c - 1)
                em1 = unicodedata.normalize('NFKD', rm['em1Text']).encode('ascii', 'ignore')
                em2 = unicodedata.normalize('NFKD', rm['em2Text']).encode('ascii', 'ignore')
                # em1 = rm["em1Text"] #python3
                # em2=rm['em2Text'] #python3
                if emIndexByText.__contains__(em1) and emIndexByText.__contains__(em2):
                    ind1 = emIndexByText[em1]
                    ind2 = emIndexByText[em2]
                    minind = len(tokens)
                    labelindex = []
                    for i1ind, i1 in enumerate(ind1):
                        for i2ind, i2 in enumerate(ind2):
                            if (i2[0] - i1[1]) * (i2[1] - i1[0]) > 0:
                                if minind > abs(i2[1] - i1[1]):
                                    minind = abs(i2[1] - i1[1])
                                    labelindex = [i1ind, i2ind]
                    if labelindex:
                        i1ind = labelindex[0]
                        i2ind = labelindex[1]
                        start1 = ind1[i1ind][0]
                        end1 = ind1[i1ind][1]
                        start2 = ind2[i2ind][0]
                        end2 = ind2[i2ind][1]
                        tag1Previous = []
                        tag2Previous = []
                        if end1 - start1 == 1:
                            tag1Previous.append(rmlabel + "__E1S")
                        elif end1 - start1 == 2:
                            tag1Previous.append(rmlabel + "__E1B")
                            tag1Previous.append(rmlabel + "__E1L")
                        else:
                            tag1Previous.append(rmlabel + "__E1B")
                            for ei in range(start1 + 1, end1 - 1):
                                tag1Previous.append(rmlabel + "__E1I")
                            tag1Previous.append(rmlabel + "__E1L")
                        if end2 - start2 == 1:
                            tag2Previous.append(rmlabel + "__E2S")
                        elif end2 - start2 == 2:
                            tag2Previous.append(rmlabel + "__E2B")
                            tag2Previous.append(rmlabel + "__E2L")
                        else:
                            tag2Previous.append(rmlabel + "__E2B")
                            for ei in range(start2 + 1, end2 - 1):
                                tag2Previous.append(rmlabel + "__E2I")
                            tag2Previous.append(rmlabel + "__E2L")
                        while True:
                            valid1 = True
                            vT1 = 0
                            for ei in range(start1, end1):
                                if not tags[ei].__eq__('O'):
                                    valid1 = False
                                    break
                            if not valid1:
                                valid1 = True
                                vT1 = 1
                                for ei in range(start1, end1):
                                    if not tags[ei].__eq__(tag1Previous[ei - start1]):
                                        valid1 = False
                                        vT1 = 0
                                        break
                            valid2 = True
                            vT2 = 0
                            for ei in range(start2, end2):
                                if not tags[ei].__eq__('O'):
                                    valid2 = False
                                    break
                            if not valid2:
                                valid2 = True
                                vT2 = 1
                                for ei in range(start2, end2):
                                    if not tags[ei].__eq__(tag2Previous[ei - start2]):
                                        valid2 = False
                                        vT2 = 0
                                        break
                            if valid1 and valid2:
                                for ei in range(start2, end2):
                                    tags[ei] = tag2Previous[ei - start2]
                                for ei in range(start1, end1):
                                    tags[ei] = tag1Previous[ei - start1]
                                Tkk[c - 1] = kk
                                if not (vT1 and vT2):
                                    ii += 1
                                    count_r[c - 1] += 1
                                    if not Mlabel.__contains__(rmlabel):
                                        Mlabel[rmlabel] = [c - 1]
                                    else:
                                        Mlabel[rmlabel].append(c - 1)
                                flag = 1
                                if (vT1 or vT2) and not (vT1 and vT2):
                                    vV.append(c - 1)
                                break
                            else:
                                start1 += len(tokens)
                                end1 += len(tokens)
                                start2 += len(tokens)
                                end2 += len(tokens)
                            if end2 > kk * len(tokens):
                                kk += 1
                                for ki in range(len(tokens)):
                                    tags.append('O')
        newsent = dict()
        newsent['tokens'] = tokens
        newsent['tags'] = tags
        newsent['lentags/lentokens'] = kk * flag
        train_json_file.write(json.dumps(newsent) + '\n')
    train_json_file.close()
    return Tkk, vV, ii, Alabel, count_r, Mlabel
Example #48
def parseInput(userInput):
	'''Uses nltk's pos_tag to tag the parts of speech of the users input'''
	tokens = nltk.word_tokenize(userInput)
	pos = nltk.pos_tag(tokens)
	return pos
Example #49
import nltk
from nltk.corpus import stopwords
import string
import re
import pdb
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

with open("comments") as f:
	t = f.read()

t = re.sub(r'http\S+', '', t)
tokens = [nltk.word_tokenize(item) for item in t.split('\n\n\n\n')]

# pdb.set_trace()
# print(tokens)

stop_words = stopwords.words('english')
stop_words.append('I')

cleantext = []

with open("comments2",'w') as file:
	for comment in tokens:
		for word in comment:
			word = re.sub('[^A-Za-z0-9]+', '', word)
			if (word not in stop_words and word!='' and word!='\n'):
				# print(word)
Example #50
 def pos_tag_words(self):
     pos_text = nltk.pos_tag(nltk.word_tokenize(
         self.output['preprocessed']))
     self.output['pos_tagged'] = " ".join(
         [pos + "-" + word for word, pos in pos_text])
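A standalone illustration of what pos_tag_words produces (tags shown are typical perceptron-tagger output, not guaranteed):

import nltk

text = "the quick brown fox"
pos_text = nltk.pos_tag(nltk.word_tokenize(text))
print(" ".join(pos + "-" + word for word, pos in pos_text))
# e.g. "DT-the JJ-quick JJ-brown NN-fox"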
Example #51
    def read_jsonfile(self, json_file, create_vocab, is_test, test_state):
        # print 'json file ', json_file
        try:
            with open(json_file, "rb") as file:
                dialogue = orjson.loads(file.read())
                #dialogue = json.load(open(json_file))
        except json.decoder.JSONDecodeError:
            return None
        filter(None, dialogue)
        # dialogue_multimodal is a list of training instances, each of len max_utter, and each ending with a system response. Whenever a dialogue has context less than max_utter, it is accordingly padded
        dialogue_vocab = {}
        dialogue_multimodal = []
        if self.task_type == "text":
            dialogue_context_text_task_text = []
            dialogue_context_image_task_text = []
            dialogue_target_text_task_text = []

        if self.task_type == "image":
            dialogue_context_text_task_image = []
            dialogue_context_image_task_image = []
            dialogue_target_image_task_image = []

        dialogue_instance_multimodal = []
        for utterances in dialogue:

            if utterances is None or len(utterances) == 0:
                continue
            if not isinstance(utterances, list):
                utterances = [utterances]

            for utterance in utterances:
                if utterance is None:
                    continue
                if not isinstance(utterance, dict):
                    print('impossible ', utterance, json_file)
                    raise Exception('error in reading dialogue json')
                    continue
                speaker = utterance['speaker']
                if 'images' not in utterance[
                        'utterance'] or 'nlg' not in utterance['utterance']:
                    continue
                images = utterance['utterance']['images']
                nlg = utterance['utterance']['nlg']

                if nlg is not None:
                    nlg = nlg.strip()  # .encode('utf-8')
                if nlg is None:
                    nlg = ""

                try:
                    nlg = nlg.lower()
                except AttributeError:
                    pass

                try:
                    nlg = nlg.replace("|", "")
                except TypeError:
                    pass

                nlg_words = nltk.word_tokenize(nlg)

                # nlg_words = [x.encode('utf-8') for x in nlg_words]

                if create_vocab:
                    self.word_counter.update(nlg_words)
                dialogue_instance_multimodal.append({
                    'images': images,
                    'nlg': nlg
                })
                if speaker == "system" and (
                        test_state is None or is_test is False or
                    (last_question_type is not None and test_state is not None
                     and is_test is True
                     and test_state in last_question_type)):
                    last_utterance = dialogue_instance_multimodal[-1]
                    # print 'dialogue instance ',[x['nlg'] for x in dialogue_instance_multimodal[:-1]]
                    # print 'last utterance ', last_utterance['nlg']
                    # print ''
                    if self.task_type == "text" and (
                            last_utterance['nlg'] is None
                            or last_utterance['nlg'] == ""):
                        continue
                    if self.task_type == "image" and (
                            last_utterance['images'] is None
                            or len(last_utterance['images']) == 0):
                        continue
                    padded_clipped_dialogue = self.pad_or_clip_dialogue(
                        dialogue_instance_multimodal)
                    if len(padded_clipped_dialogue) != (self.max_utter + 1):
                        raise Exception(
                            'some problem with dialogue instance, len != max_utter+1'
                        )
                    # dialogue_instance_task_test is a max_utter length list of utterances where the last utterance in the list is the target utterance
                    dialogue_instance_text_context = [
                        x['nlg'] if x['nlg'] is not None else ''
                        for x in padded_clipped_dialogue[:-1]
                    ]
                    # dialogue_instance_task_image is a max_utter length list of image-lists where the last entry in the list is a single image instead of a list and it is the target image
                    dialogue_instance_image_context = [
                        x['images'] if x['images'] is not None else []
                        for x in padded_clipped_dialogue[:-1]
                    ]

                    # print 'dialogue_instance_text_context ', dialogue_instance_text_context
                    # print ''
                    # print 'dialogue_instance_image_context ', dialogue_instance_image_context
                    if len(dialogue_instance_text_context) != self.max_utter:
                        raise Exception(
                            'len(dialogue_instance_text_context)!=self.max_utter'
                        )
                    if len(dialogue_instance_image_context) != self.max_utter:
                        raise Exception(
                            'len(dialogue_instance_image_context)!=self.max_utter'
                        )
                    if self.task_type == "text":
                        dialogue_target_text = dialogue_instance_multimodal[
                            -1]['nlg']
                        dialogue_instance_context_text_task_text = copy.deepcopy(
                            dialogue_instance_text_context)
                        dialogue_instance_context_image_task_text = copy.deepcopy(
                            dialogue_instance_image_context)
                        dialogue_context_text_task_text.append(
                            dialogue_instance_context_text_task_text)
                        dialogue_context_image_task_text.append(
                            dialogue_instance_context_image_task_text)
                        dialogue_target_text_task_text.append(
                            dialogue_target_text)
                    if self.task_type == "image":
                        dialogue_target_images = dialogue_instance_multimodal[
                            -1]['images']
                        for image in images:
                            dialogue_instance_context_text_task_image = copy.deepcopy(
                                dialogue_instance_text_context)
                            dialogue_instance_context_image_task_image = copy.deepcopy(
                                dialogue_instance_image_context)
                            dialogue_context_text_task_image.append(
                                dialogue_instance_context_text_task_image)
                            dialogue_context_image_task_image.append(
                                dialogue_instance_context_image_task_image)
                            dialogue_target_image_task_image.append(image)
                if 'question-type' in utterance and test_state is not None:
                    last_question_type = utterance['question-type']
                elif speaker != "system":
                    last_question_type = None
        if self.task_type == "text":
            with open(self.dialogue_context_text_task_text_file, 'a') as fp:
                for dialogue_instance in dialogue_context_text_task_text:
                    dialogue_instance = '|'.join(dialogue_instance)
                    fp.write(dialogue_instance + '\n')
            with open(self.dialogue_context_image_task_text_file, 'a') as fp:
                for dialogue_instance in dialogue_context_image_task_text:
                    image_context = None
                    if len(dialogue_instance) != self.max_utter:
                        raise Exception(
                            'len(dialogue_instance_image_context)!=self.max_utter'
                        )
                    for images in dialogue_instance:
                        if image_context is None:
                            try:
                                image_context = ",".join(images)
                            except TypeError:  # If images = [None]
                                image_context = ",".join([])
                        else:
                            try:
                                image_context = image_context + "|" + ",".join(
                                    images)
                            except TypeError:  # If images = [None]
                                image_context = image_context + "|" + ",".join(
                                    [])
                    if len(image_context.split("|")) != self.max_utter:
                        raise Exception(
                            'len(dialogue_instance_image_context)!=self.max_utter'
                        )
                    fp.write(image_context + '\n')
            with open(self.dialogue_target_text_task_text_file, 'a') as fp:
                for dialogue_instance in dialogue_target_text_task_text:
                    fp.write(dialogue_instance + '\n')
        if self.task_type == "image":
            with open(self.dialogue_context_text_task_image_file, 'a') as fp:
                for dialogue_instance in dialogue_context_text_task_image:
                    dialogue_instance = '|'.join(dialogue_instance)
                    fp.write(dialogue_instance + '\n')
            with open(self.dialogue_context_image_task_image_file, 'a') as fp:
                for dialogue_instance in dialogue_context_image_task_image:
                    image_context = None
                    if len(dialogue_instance) != self.max_utter:
                        raise Exception(
                            'len(dialogue_instance_image_context)!=self.max_utter'
                        )
                    for images in dialogue_instance:
                        if image_context is None:
                            image_context = ",".join(images)
                        else:
                            image_context = image_context + "|" + ",".join(
                                images)
                    if len(image_context.split("|")) != self.max_utter:
                        raise Exception(
                            'len(dialogue_instance_image_context)!=self.max_utter'
                        )
                    fp.write(image_context + '\n')
            with open(self.dialogue_target_image_task_image_file, 'a') as fp:
                for dialogue_instance in dialogue_target_image_task_image:
                    fp.write(dialogue_instance + '\n')
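
# The comments in the example above describe each training instance as a fixed
# window of max_utter context turns plus one target turn, with short dialogues
# padded. pad_or_clip_dialogue itself is not shown in the listing; the following
# is only a minimal sketch of such a helper (an assumption, not the original),
# padding with empty turns and keeping the most recent turns.
def pad_or_clip_dialogue(dialogue_instance, max_utter=2):
    empty_turn = {'images': [], 'nlg': ''}
    needed = max_utter + 1  # max_utter context turns + 1 target turn
    if len(dialogue_instance) >= needed:
        return dialogue_instance[-needed:]      # clip: keep the latest turns
    padding = [empty_turn] * (needed - len(dialogue_instance))
    return padding + dialogue_instance          # pad on the left, target stays last

# Usage sketch: a two-turn dialogue padded out to max_utter=2 context turns
turns = [{'images': [], 'nlg': 'show me black shoes'},
         {'images': ['img1.jpg', 'img2.jpg'], 'nlg': 'here are some options'}]
print(pad_or_clip_dialogue(turns, max_utter=2))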
Example #52
0
import pickle

import nltk
import numpy
import tensorflow  # reset_default_graph() below requires TensorFlow 1.x (as does tflearn)
import tflearn


def trainModel(data, stemmer):
    #print(data["intents"])
    try:
        with open("data.pickle", "rb") as f:
            words,labels, training, output = pickle.load(f)
    except:
        words = []
        labels = []
        docs_x  = []
        docs_y = []
        for intent in data["intents"]:
            if intent["tag"] not in labels:
                labels.append(intent["tag"])
            for pattern in intent["patterns"]:
                wrds = nltk.word_tokenize(pattern)
                words.extend(wrds)
                docs_x.append(wrds)
                docs_y.append(intent["tag"])

        words = [stemmer.stem(w.lower()) for w in words if w != "?"]
        words = sorted(list(set(words)))

        labels = sorted(labels)

        training = []
        output = []

        out_empty = [0 for _ in range(len(labels))]


        for x, doc in enumerate(docs_x):
            bag =[]
            wrds = [stemmer.stem(w) for w in doc]

            for w in words:
                if w in wrds:
                    bag.append(1)
                else:
                    bag.append(0)

            output_row = out_empty[:]
            output_row[labels.index(docs_y[x])] = 1

            training.append(bag)
            output.append(output_row)
            
        training = numpy.array(training)
        output  = numpy.array(output)
        with open("data.pickle", "wb") as f:
            pickle.dump((words,labels, training, output),f)        

    tensorflow.reset_default_graph()

    net = tflearn.input_data(shape=[None, len(training[0])])
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, len(output[0]), activation="softmax")
    net = tflearn.regression(net)

    model = tflearn.DNN(net) 
    try:
        model.load("./model.tflearn")
        
    except RuntimeError:
        model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)   
        model.save("./model.tflearn")  
    
    return  (words,labels, model)  
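
# The loop above builds a 0/1 bag-of-words vector per training pattern. As a
# usage sketch (not part of the original), an incoming sentence would be encoded
# the same way before calling model.predict, reusing the stemmer and the words
# list returned by trainModel:
import nltk
import numpy

def bag_of_words(sentence, words, stemmer):
    # mark which vocabulary entries occur in the (stemmed) sentence
    bag = [0] * len(words)
    tokens = [stemmer.stem(w.lower()) for w in nltk.word_tokenize(sentence)]
    for token in tokens:
        for i, w in enumerate(words):
            if w == token:
                bag[i] = 1
    return numpy.array(bag)

# e.g. results = model.predict([bag_of_words("Hi there!", words, stemmer)])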
Example #53
0
def process_questions(args):
    ''' Encode question tokens'''
    print('Loading data')
    with open(args.annotation_file, 'r') as dataset_file:
        instances = json.load(dataset_file)

    # Either create the vocab or load it from disk
    if args.mode in ['train']:
        print('Building vocab')
        answer_cnt = {}
        for instance in instances:
            answer = instance['answer']
            answer_cnt[answer] = answer_cnt.get(answer, 0) + 1

        answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
        answer_counter = Counter(answer_cnt)
        frequent_answers = answer_counter.most_common(args.answer_top)
        total_ans = sum(item[1] for item in answer_counter.items())
        total_freq_ans = sum(item[1] for item in frequent_answers)
        print("Number of unique answers:", len(answer_counter))
        print("Total number of answers:", total_ans)
        print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans))

        for token, cnt in Counter(answer_cnt).most_common(args.answer_top):
            answer_token_to_idx[token] = len(answer_token_to_idx)
        print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))

        question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
        for i, instance in enumerate(instances):
            question = instance['question'].lower()[:-1]
            for token in nltk.word_tokenize(question):
                if token not in question_token_to_idx:
                    question_token_to_idx[token] = len(question_token_to_idx)
        print('Get question_token_to_idx')
        print(len(question_token_to_idx))

        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
            'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
        }

        print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset))
        with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f:
            json.dump(vocab, f, indent=4)
    else:
        print('Loading vocab')
        with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f:
            vocab = json.load(f)

    # Encode all questions
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    question_ids = []
    video_ids_tbw = []
    video_names_tbw = []
    all_answers = []
    for idx, instance in enumerate(instances):
        question = instance['question'].lower()[:-1]
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        im_name = instance['video_id']
        video_ids_tbw.append(im_name)
        video_names_tbw.append(im_name)

        if instance['answer'] in vocab['answer_token_to_idx']:
            answer = vocab['answer_token_to_idx'][instance['answer']]
        elif args.mode in ['train']:
            answer = 0
        elif args.mode in ['val', 'test']:
            answer = 1

        all_answers.append(answer)
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    glove_matrix = None
    if args.mode == 'train':
        token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        glove = pickle.load(open(args.glove_pt, 'rb'))
        dim_word = glove['the'].shape[0]
        glove_matrix = []
        for i in range(len(token_itow)):
            vector = glove.get(token_itow[i], np.zeros((dim_word,)))
            glove_matrix.append(vector)
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode))
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'answers': all_answers,
        'glove': glove_matrix,
    }
    with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f:
        pickle.dump(obj, f)
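
# utils.encode is referenced above but not shown. The following is only a
# minimal sketch of such an encoder (an assumption, not the project's actual
# utils module), mapping out-of-vocabulary tokens to '<UNK>' when allow_unk
# is True.
def encode(tokens, token_to_idx, allow_unk=False):
    encoded = []
    for token in tokens:
        if token in token_to_idx:
            encoded.append(token_to_idx[token])
        elif allow_unk:
            encoded.append(token_to_idx['<UNK>'])
        else:
            raise KeyError('token %r is not in the vocabulary' % token)
    return encoded

# With the vocab layout used above ('<NULL>': 0, '<UNK>': 1):
vocab = {'<NULL>': 0, '<UNK>': 1, 'what': 2, 'is': 3}
print(encode(['what', 'is', 'this'], vocab, allow_unk=True))  # -> [2, 3, 1]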
Example #54
0
    def binarize_corpora(self, dialogue_file_text, dialogue_file_image,
                         dialogue_target_file, task_type, dialogue_pkl_file):
        binarized_corpus = []
        binarized_corpus_text_context = []
        binarized_corpus_image_context = []
        binarized_corpus_target = []
        unknowns = 0.
        num_terms = 0.
        freqs = collections.defaultdict(lambda: 0)
        df = collections.defaultdict(lambda: 0)
        num_instances = 0
        with open(dialogue_file_text) as textlines, open(
                dialogue_file_image) as imagelines, open(
                    dialogue_target_file) as targetlines:
            for text_context, image_context, target in zip(
                    textlines, imagelines, targetlines):
                text_context = text_context.lower().strip()
                image_context = image_context.strip()
                target = target.strip()
                # print 'text_content ', text_context
                # print 'image_content ', image_context
                # print 'target ', target
                # print ''
                num_instances += 1
                if num_instances % 10000 == 0:
                    print('finished ', num_instances, ' instances')
                utterances = text_context.split('|')
                binarized_text_context = []
                for utterance in utterances:
                    try:
                        utterance_words = nltk.word_tokenize(utterance)
                    except:
                        utterance_words = utterance.split(' ')
                    utterance_words = self.pad_or_clip_utterance(
                        utterance_words)
                    if self.end_word_symbol not in utterance_words:
                        print('utterance ', utterance)
                        print('utterance words ', utterance_words)
                        raise Exception('utterance does not have end symbol')
                    utterance_word_ids = []
                    for word in utterance_words:
                        word_id = self.vocab_dict.get(word,
                                                      self.unknown_word_id)
                        utterance_word_ids.append(word_id)
                        unknowns += 1 * (word_id == self.unknown_word_id)
                        freqs[word_id] += 1
                    if self.end_word_id not in utterance_word_ids:
                        print('utterance word ids ', utterance_word_ids)
                        raise Exception(
                            'utterance word ids does not have end word id')
                    num_terms += len(utterance_words)

                    unique_word_indices = set(utterance_word_ids)
                    for word_id in unique_word_indices:
                        df[word_id] += 1
                    binarized_text_context.append(utterance_word_ids)
                if len(binarized_text_context) != self.max_utter:
                    raise Exception(
                        'binarized_text_context should be a list of length max_utter, found length ',
                        len(binarized_text_context))
                binarized_image_context = [
                    self.pad_or_clip_images(x.split(","))
                    for x in image_context.split('|')
                ]

                if len(binarized_image_context) != self.max_utter:
                    raise Exception(
                        'binarized_image_context should be a list of length max_utter, found length ',
                        len(binarized_image_context))

                binarized_target = None
                if task_type == "text":
                    utterance = target
                    try:
                        utterance_words = nltk.word_tokenize(utterance)
                    except:
                        utterance_words = utterance.split(' ')
                    utterance_words = self.pad_or_clip_utterance(
                        utterance_words)
                    if self.end_word_symbol not in utterance_words:
                        print('utterance ', utterance)
                        print('utterance words ', utterance_words)
                        raise Exception('utterance does not have end symbol')
                    utterance_word_ids = []
                    for word in utterance_words:
                        word_id = self.vocab_dict.get(word,
                                                      self.unknown_word_id)
                        utterance_word_ids.append(word_id)
                        unknowns += 1 * (word_id == self.unknown_word_id)
                        freqs[word_id] += 1
                    if self.end_word_id not in utterance_word_ids:
                        print('utterance word ids ', utterance_word_ids)
                        raise Exception(
                            'utterance word ids does not have end word id')
                    num_terms += len(utterance_words)

                    unique_word_indices = set(utterance_word_ids)
                    for word_id in unique_word_indices:
                        df[word_id] += 1
                    binarized_target = utterance_word_ids
                if task_type == "image":
                    binarized_target = target
                # binarized_corpus_text_context.append(binarized_text_context)
                # binarized_corpus_image_context.append(binarized_image_context)
                # binarized_corpus_target.append(binarized_target)
                binarized_corpus.append([
                    binarized_text_context, binarized_image_context,
                    binarized_target
                ])
        # binarized_corpus = [binarized_corpus_text_context,  binarized_corpus_image_context, binarized_corpus_target]
        self.safe_pickle(binarized_corpus, dialogue_pkl_file)
        if not os.path.isfile(self.vocab_file):
            self.safe_pickle([(word, word_id, freqs[word_id], df[word_id])
                              for word, word_id in self.vocab_dict.items()],
                             self.vocab_stats_file)
            inverted_vocab_dict = {
                word_id: word
                for word, word_id in self.vocab_dict.items()
            }
            self.safe_pickle(inverted_vocab_dict, self.vocab_file)

            print('dumped vocab in ', self.vocab_file)
        self.logger.info("Number of unknowns %d" % unknowns)
        self.logger.info("Number of terms %d" % num_terms)
        self.logger.info(
            "Mean document length %f" %
            float(sum(map(len, binarized_corpus)) / len(binarized_corpus)))
        self.logger.info(
            "Writing training %d dialogues (%d left out)" %
            (len(binarized_corpus), num_instances + 1 - len(binarized_corpus)))
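
# pad_or_clip_utterance is not shown in the listing. The code above only checks
# that every processed utterance contains the end-word symbol, so the following
# is a minimal sketch under that assumption (hypothetical symbols and maximum
# length, not the original implementation):
def pad_or_clip_utterance(words, max_len=20, end_symbol='</e>', pad_symbol='<pad>'):
    words = words[:max_len - 1] + [end_symbol]          # clip, then close with the end symbol
    words += [pad_symbol] * (max_len - len(words))      # right-pad to a fixed length
    return words

print(pad_or_clip_utterance(['show', 'me', 'black', 'shoes']))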
Example #55
0
                     'cue-words-score'] = (matches /
                                           len(sentence_mapper[sentence]))
#This metric is very biased: if only one cue word is given, its mere presence
#yields a score of 1, which strongly favors such sentences.
#A better metric might put the sentence length in the denominator and count the
#words whose similarity to the cue words exceeds 0.50, rather than exact
#cue-word matches only (a sketch of the length-normalized idea follows this example).

#----------Rating a sentence according to position and length------------
#Using Barrera and Verma's first model to score sentence based on the position
total_sentences = len(original_sentences)
alpha = 2

for index, sentence in enumerate(original_sentences):
    vector_space.loc[sentence, 'position-score'] = (np.cos(
        (2 * np.pi * index) / (total_sentences - 1)) + alpha - 1) / (alpha)
    vector_space.loc[sentence, 'length-score'] = len(word_tokenize(sentence))

mean = np.mean(vector_space['length-score'])
std_dev = np.sqrt(np.var(vector_space['length-score']))
max_val = max(
    np.abs(min(vector_space['length-score']) - mean) / std_dev,
    np.abs(max(vector_space['length-score']) - mean) / std_dev)

#Rating mid-sized sentences with higher ratings
vector_space['length-score'] = vector_space['length-score'].apply(
    lambda val: max_val - np.abs(mean - val) / std_dev)

#-------Summarization Finalized Results--------
#Calculating the final score for each sentence
#Using - tf-score, length-score, position-score, cue-words-score and the paragraph-score
vector_space['sentence-score'] = vector_space.apply(
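
# The comment above notes that the cue-words score saturates when only one cue
# word is supplied and suggests normalizing by sentence length instead. A
# minimal sketch of that suggested variant (an assumption, not the original
# scoring code):
from nltk import word_tokenize

def cue_word_score(sentence, cue_words):
    tokens = [t.lower() for t in word_tokenize(sentence)]
    if not tokens:
        return 0.0
    cue_set = {c.lower() for c in cue_words}
    matches = sum(1 for t in tokens if t in cue_set)
    return matches / len(tokens)   # length in the denominator dampens the single-hit bias

print(cue_word_score("Profits rose sharply this quarter", ["profits"]))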
Example #56
0
 def make_tokenized_matrix_eng(self, texts: List[str]):
     self.tokenized_matrix = []
     print('making tokenized matrix...')
     for text in tqdm(texts):
         self.tokenized_matrix.append(
             [self.lemmatizer(word) for word in word_tokenize(text)])
Example #57
0
 def _word_tokenize(self, text):
     return nltk.word_tokenize(text)
Example #58
0
def prepText(sentence):
    tokenized = word_tokenize(sentence)
    text = nltk.Text(tokenized)
    return text
Example #59
0
#We import the extractor function from the extract_name.py file. The extractor
#function helps us extract the important features from a report, as
#mentioned in the first paragraph.

import os
import re, nltk, psycopg2
import dates, a_name
import extract_name
from extract_name import extractor

d = 0

#In the next statement we open the r_fil_date.txt file. It contains the
#locations of the dates files of the various company ids; for
#example, /home/finance/data/600045616/dates is the dates file
#of the company with company id 600045616.
fo = open("/home/finance/reports2sql/r_fil_date.txt", "rb+")
raw = fo.read()

#We use nltk.word_tokenize to break our raw data into tokens, where each
#token is the location of the dates file of a company id.
locs = nltk.word_tokenize(raw)

#We loop here to go through every dates file. Then from the corresponding
#dates file we extract the dates on which the reports were published. From
#the reports we extract the relevant features and put them into our
#ratings1 database.
for t in locs:
    d = d + extractor(t)
Example #60
0
def text_to_tokens(src_file,
                   body_start=0,
                   body_end=-1,
                   chap_pat=r'^\s*Chapter.*$',
                   para_pat=r'\n\n+',
                   sent_pat=r'([.;?!"“”]+)',
                   token_pat=r'([\W_]+)'):

    # Text to lines
    lines = open(src_file, 'r', encoding='utf-8').readlines()
    lines = lines[body_start - 1:body_end + 1]
    df = pd.DataFrame({'line_str': lines})
    df.index.name = 'line_id'
    del (lines)

    # FIX CHARACTERS TO IMPROVE TOKENIZATION
    df.line_str = df.line_str.str.replace('—', ' — ')
    df.line_str = df.line_str.str.replace('-', ' - ')

    # Lines to Chapters
    mask = df.line_str.str.match(chap_pat)
    df.loc[mask, 'chap_id'] = df.apply(lambda x: x.name, 1)
    df.chap_id = df.chap_id.ffill().bfill().astype('int')
    chap_ids = df.chap_id.unique().tolist()
    df['chap_num'] = df.chap_id.apply(lambda x: chap_ids.index(x))
    chaps = df.groupby('chap_num')\
        .apply(lambda x: ''.join(x.line_str))\
        .to_frame()\
        .rename(columns={0:'chap_str'})
    del (df)

    # Chapters to Paragraphs
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'para_str'})
    paras.index.names = PARAS
    paras.para_str = paras.para_str.str.strip()
    paras.para_str = paras.para_str.str.replace(r'\n', ' ', regex=True)
    paras.para_str = paras.para_str.str.replace(r'\s+', ' ', regex=True)
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del (chaps)

    # Paragraphs to Sentences
    #     sents = paras.para_str.str.split(sent_pat, expand=True)\
    sents = paras.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    sents.index.names = SENTS
    del (paras)

    # Sentences to Tokens
    #     tokens = sents.sent_str.str.split(token_pat, expand=True)\
    tokens = sents.sent_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    tokens.index.names = OHCO
    del (sents)

    tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
    tokens = tokens.drop(columns='pos_tuple')

    # Tag punctuation and numbers
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'^.*\d.*$').astype('int')

    # Extract vocab with minimal normalization
    WORDS = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()\
        .str.replace(r'["_*.]', '', regex=True)

    vocab = tokens[tokens.punc == 0].term_str.value_counts().to_frame()\
        .reset_index()\
        .rename(columns={'index':'term_str', 'term_str':'n'})
    vocab = vocab.sort_values('term_str').reset_index(drop=True)
    vocab.index.name = 'term_id'

    # Get priors for V
    vocab['p'] = vocab.n / vocab.n.sum()

    # Add stems
    stemmer = nltk.stem.porter.PorterStemmer()
    vocab['port_stem'] = vocab.term_str.apply(lambda x: stemmer.stem(x))

    # Define stopwords
    sw = pd.DataFrame({'x': 1}, index=nltk.corpus.stopwords.words('english'))
    vocab['stop'] = vocab.term_str.map(sw.x).fillna(0).astype('int')
    del (sw)

    # Add term_ids to tokens
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab
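
# Usage sketch for the function above. PARAS, SENTS and OHCO are module-level
# constants in the original file; plausible values are assumed here, as are the
# input file path and body boundaries, so treat this as illustrative rather
# than the original setup.
import nltk
import pandas as pd

PARAS = ['chap_num', 'para_num']
SENTS = PARAS + ['sent_num']
OHCO = SENTS + ['token_num']

tokens, vocab = text_to_tokens('austen-persuasion.txt',      # hypothetical source file
                               body_start=1, body_end=10000,  # hypothetical body boundaries
                               chap_pat=r'^\s*Chapter\s+\d+')

print(tokens.head())                                       # one row per token, indexed by OHCO
print(vocab.sort_values('n', ascending=False).head(10))    # most frequent terms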