Ejemplo n.º 1
0
def updateNetwork(query, network, qp, searcher, tlc, field, ntype):
  #find the top 50 documents
  q = qp.parse(unicode(query))
  totalText = ''
  total = 0.0
  tmin = -1000
  tmax = 1000
  terms = set()
  try:
    searcher.search_with_collector(q, tlc)
  except TimeLimit:
    print '--LONG-- ', query

  results = tlc.results()
  for entry in results:
    totalText += entry[field] + ' '

  finder = BigramCollocationFinder.from_words(word_tokenize(totalText))
  #update the network

  rList = finder.score_ngrams(biMeas.pmi)

  for rTuple in rList:
    total += rTuple[1]
    if tmin > rTuple[1]:
      tmin = rTuple[1]
    if tmax < rTuple[1]:
      tmax = rTuple[1]

  for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]):
    if (len(terms) < 3000 and finder.ngram_fd[rTuple[0]] > 2
     ) or (finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query or
           rTuple[0][1] in query and len(terms) < 4000):
      #if (len(terms) < 3000  and finder.ngram_fd[rTuple[0]] > 2) or (rTuple[0][0] in query or rTuple[0][1] in query and len(terms) < 4000):
      a = rTuple[0][0]
      if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a):
        if a not in network:
          network[a] = {}
          terms.add(a)
        b = rTuple[0][1]
        if len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite(
            b):
          if b not in network[a]:
            network[a][b] = {}
            terms.add(b)
          network[a][b][ntype] = network[a][b].setdefault(ntype, 0.0) + (
              (rTuple[1] - tmin) / (tmax - tmin))

  print query, ntype, len(terms)

  return terms
Ejemplo n.º 2
0
  def updateNetworkFromText(self, query, text, ntype):

    total = 0.0
    tmin = -1000
    tmax = 1000

    qsplit = query.split()
    for entry in qsplit:
      term = self.porter.stem(entry)
      self.network[term] = {}
      self.terms.add(term)

    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    #update the network

    rList = finder.score_ngrams(self.biMeas.pmi)
    for rTuple in rList:
      total += rTuple[1]
      if tmin > rTuple[1]:
        tmin = rTuple[1]
      if tmax < rTuple[1]:
        tmax = rTuple[1]

    for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]):
      if (len(self.terms) < 1000  and finder.ngram_fd[rTuple[0]] > 2) or \
			((finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query) or \
			 (rTuple[0][1] in query and len(self.terms) < 1500)):
        noSymbA = SYMBreg.sub('', rTuple[0][0])
        noSymbB = SYMBreg.sub('', rTuple[0][1])

        if noSymbA not in stopSet and noSymbB not in stopSet:
          a = self.porter.stem(noSymbA)
          b = self.porter.stem(noSymbB)
          if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a) \
					and len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite(b):
            if a not in self.network:
              self.network[a] = {}
              self.terms.add(a)
            if b not in self.network[a]:
              self.network[a][b] = {}
              self.terms.add(b)
            self.network[a][b][ntype] = self.network[a][b].setdefault(
                ntype, 0.0) + ((rTuple[1] - tmin) / (tmax - tmin))

    print query, ntype, len(self.terms)