def postProcess(start1, query):
    """Run a Linggle query and return lemmatized phrases with summed counts.

    Each result returned by ``linggleit(query)`` is read through its
    ``'phrase'`` entry (a sequence of word strings, possibly wrapped in
    ``<strong>`` tags) and its ``'count_str'`` entry (a count formatted
    with thousands separators, e.g. ``'1,234'``).

    For every result the words from index ``start1`` onward are kept, the
    ``<strong>`` markup is removed, the first kept word is replaced by
    ``sqlite.search_lemma`` of its stripped, lower-cased form, and the words
    are joined with single spaces.  Results that produce the same phrase
    have their counts added together.

    Args:
        start1: Index of the first word of each n-gram to keep.
        query: Query string passed unchanged to ``linggleit``.

    Returns:
        A list of ``(phrase, count)`` tuples ordered by count, largest
        first; phrases with equal counts are ordered alphabetically.
    """
    res = linggleit(query)

    totals = {}
    for ngram in res:
        words = [
            w.replace('<strong>', '').replace('</strong>', '')
            for w in ngram['phrase'][start1:]
        ]
        if not words:
            # Nothing left after dropping the leading words; there is no
            # head word to lemmatize, so the n-gram contributes no phrase.
            continue

        # Only the first kept word is normalised to its lemma.
        words[0] = sqlite.search_lemma(words[0].strip().lower())
        phrase = ' '.join([w.strip() for w in words])

        count = int(ngram['count_str'].replace(',', ''))
        totals[phrase] = totals.get(phrase, 0) + count

    # Sorting on (-count, phrase) reproduces "sort by phrase, then stable
    # sort by count descending" without relying on zip() returning a list
    # (it is an iterator on Python 3 and has no .sort()).
    return sorted(totals.items(), key=lambda item: (-item[1], item[0]))
def vpCollocation(headword):
    """Return collocation phrases for ``headword`` with inflections merged.

    Builds the query ``'pron. <headword> prep. ?n.'``, obtains
    ``(phrase, count)`` pairs from ``postProcess`` (dropping the first word
    of every n-gram), and then combines phrases that differ only in the
    inflection of their last word: e.g. "pay attention to detail" and
    "pay attention to details" become "pay attention to detail/details"
    with the two counts added.

    A phrase is merged only when the phrase obtained by replacing its last
    word with ``sqlite.search_lemma`` of that word is itself present in the
    results; otherwise it is left untouched.

    Args:
        headword: Word substituted into the query template.

    Returns:
        A list sorted by count, largest first.  Merged entries are
        two-element lists ``[phrase, count]``; unmerged entries are the
        ``(phrase, count)`` tuples returned by ``postProcess``.
    """
    template1 = 'pron. %s prep. ?n.'
    start1 = 1
    query = template1 % headword
    post = postProcess(start1, query)

    known_phrases = set([phrase for phrase, _count in post])

    # base phrase -> [(inflected last word, count), ...]
    variants = {}
    # Inflected phrases that have been folded into their base phrase.
    absorbed = set()

    for phrase, count in post:
        prefix, sep, last_word = phrase.rpartition(' ')
        lemma = str(sqlite.search_lemma(last_word.strip().lower()))
        if str(last_word) == lemma:
            continue  # already the base form

        # Require the whole phrase to match, not just the last word, so
        # that unrelated phrases sharing a final noun are never combined.
        base_phrase = prefix + sep + lemma
        if base_phrase in known_phrases:
            variants.setdefault(base_phrase, []).append((last_word, count))
            absorbed.add(phrase)

    # Build the result in one pass instead of deleting by index from a
    # list that shrinks underneath those indices.
    merged = []
    remaining = []
    for phrase, count in post:
        if phrase in variants:
            forms = variants[phrase]
            label = phrase + '/' + '/'.join([word for word, _c in forms])
            total = int(count) + sum([int(c) for _w, c in forms])
            merged.append([label, total])
        elif phrase not in absorbed:
            remaining.append((phrase, count))

    # Merged entries go first so they win ties under the stable sort.
    res = merged + remaining
    res.sort(key=lambda x: x[1], reverse=True)
    return res