Example #1
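A similarity search over a BibTeX file: one random entry is used as a query, and every other entry is ranked by cosine similarity of its noun stems. The imports and the module-level `ignore_list` below are assumed from context; the original excerpt does not show them.

import re
import string
import sys

import nltk
import numpy as np

import btparse

try:
  import pytools  # optional progress-bar helper used below
except ImportError:
  pytools = None

ignore_list = set()  # assumed: stop-word list defined elsewhere in the original file
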
def main():
  stem = nltk.stem.LancasterStemmer()

  def cleanword(w):
    # strip surrounding punctuation, lower-case, then stem
    return stem.stem(w.strip(string.punctuation).lower())
  bib = btparse.load(sys.argv[1])
  # pick a random entry that has an abstract
  aid = np.random.randint(len(bib))
  while 'abstract' not in bib[aid]:
    aid = np.random.randint(len(bib))
  
  # tokenise the entry's abstract and title, keeping only singular nouns ("NN")
  abstract = nltk.wordpunct_tokenize(bib[aid]['abstract'] + " " + bib[aid]['title'])
  q_vec0 = sorted(x[0] for x in nltk.pos_tag(abstract) if x[1] == "NN")
  
  # build the query vector: unique cleaned stems and their occurrence counts
  q_vec = []
  q_val = []
  for w in q_vec0:
    w = cleanword(w)
    # skip short words, stop words, and anything containing a backslash (LaTeX markup)
    if len(w) > 2 and w not in ignore_list and re.search(r'\\', w) is None:
      if w not in q_vec:
        q_vec.append(w)
        q_val.append(1)
      else:
        q_val[q_vec.index(w)] += 1
  
  # normalise the query counts to a unit vector
  q_val = np.array(q_val, dtype=float)
  q_val /= np.sqrt(np.dot(q_val, q_val))
  prob = np.zeros(len(bib))
  
  # score every other entry against the query vector
  if pytools:
    progress = pytools.ProgressBar("Analysing", len(bib))
    progress.draw()
  for ind, entry in enumerate(bib):
    if ind != aid and 'abstract' in entry:
      abstract = nltk.wordpunct_tokenize(entry['abstract'] + " " + entry['title'])
      r_vec = [x[0] for x in nltk.pos_tag(abstract) if x[1] == "NN"]
      # count occurrences of each query term in this entry
      r_val = np.zeros(len(q_val))
      for w in r_vec:
        w = cleanword(w)
        if w in q_vec:
          r_val[q_vec.index(w)] += 1
      # cosine similarity between the two term vectors
      mod = np.dot(r_val, r_val)
      if mod > 0:
        prob[ind] = np.dot(r_val / np.sqrt(mod), q_val)
    if pytools: progress.progress()
  if pytools: print ""
  
  # sort based on probability (best first)
  inds_sort = np.argsort(prob)[::-1]
  
  print('similar papers to:\n\t%s\n\t\tby: %s\n' % (bib[aid]['title'], bib[aid]['author']))
  for i in range(10):
    best = inds_sort[i]
    print('%3d.\t%s\n\t\tby: %s\n\t\tid = %3d, prob = %f\n'
          % (i + 1, bib[best]['title'], bib[best]['author'], best, prob[best]))
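
# Entry point (assumed; the call site was not part of the original excerpt).
if __name__ == "__main__":
  main()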
Example #2
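This excerpt begins inside a loop over bibliography entries; the imports, module-level globals, and function head below are a reconstruction inferred from the code that follows, not part of the original.

import operator
import pickle
import string
import sys

import nltk

import btparse

try:
  import pytools  # optional progress-bar helper
except ImportError:
  pytools = None

strip_chars = string.punctuation     # assumed: characters stripped from tokens
ignore_list = set()                  # assumed: stop-word list defined elsewhere
stem = nltk.stem.LancasterStemmer()

def getGlobalWordVector(bib, numkeep=0):
  # count how often each noun stem appears across the whole bibliography
  wordvector = {}
  if pytools:
    progress = pytools.ProgressBar("Analysing", len(bib))
    progress.draw()
  for item in bib: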
    if item.has_key("abstract") and item.has_key("title"):
      text = nltk.wordpunct_tokenize(item["abstract"] + " " + item["title"])
      for word in [x[0] for x in nltk.pos_tag(text) if x[1] in ("NN")]:
        word = stem.stem(word.strip(strip_chars).lower())
        if len(word)>1 and word not in ignore_list:
          try:
            wordvector[word] += 1
          except KeyError:
            wordvector[word] = 1
    if pytools: progress.progress()
  if pytools: print ""
  sortedwordvector = sorted(wordvector.iteritems(), key=operator.itemgetter(1))
  return sortedwordvector[-numkeep:]

if __name__ == "__main__":
  if len(sys.argv) < 3:
    print("Usage: %s input.bib output.dat [numkeep]" % sys.argv[0])
    sys.exit(127)
  elif len(sys.argv) == 4:
    numkeep = int(sys.argv[3])
  else:
    numkeep = 0

  bib = btparse.load(sys.argv[1])
  globalWordVector = getGlobalWordVector(bib, numkeep=numkeep)

  # save just the word list; the counts are only needed for the report below
  with open(sys.argv[2], "wb") as f:
    pickle.dump([x[0] for x in globalWordVector], f)

  print("Top 50 words:")
  for word, count in globalWordVector[-50:]:
    print("%-15s %d" % (word, count))