def checkTxt(pHashed, pProcessed, txt, lineNum, m, rank):
  '''Check for matches between pHashed (hashed pattern) and txt (one chunk of hashed corpus text).

  Every run of m consecutive hashes in txt is compared with every run of m
  consecutive hashes in pHashed.  Each hit is recorded as a tuple
  (k, words): k is the 0-based offset of the run within txt and words is the
  corresponding m entries of pProcessed joined by single spaces.  If there is
  at least one hit, the list of hits is passed to processMatches along with m.

  pHashed    -- sequence of hashed pattern words
  pProcessed -- pattern words, indexed in step with pHashed
  txt        -- sliceable sequence of hashed corpus words
  lineNum    -- not used by this function
  m          -- number of consecutive words that must agree
  rank       -- not used by this function

  Returns None.
  '''

  # The pattern windows do not depend on the corpus position, so slice them
  # once up front instead of once per corpus window.
  patternWindows = [tuple(pHashed[i:i+m]) for i in range(len(pHashed)-m+1)]
  if not patternWindows:
    return # pattern is shorter than m words, so nothing can match

  matches = []

  # m staggered iterators over txt, zipped together, yield each run of m
  # consecutive corpus hashes as a tuple; k is the offset of the run's first word
  for k,txtMtuple in enumerate(izip(*[iter(txt[i:]) for i in xrange(m)])):

    for i,window in enumerate(patternWindows): # i = first word of pattern run

      # tuple equality compares position by position and stops at the first
      # difference, which is what the hand-written inner loop used to do
      if window == txtMtuple:
        matches.append((k,' '.join(pProcessed[i:i+m])))

  if matches:
    processMatches(matches,m) # hand the hits over for reporting
Ejemplo n.º 2
0
def full_search(hashedData, pat, m=20):
    """Search prehashed corpus text for runs of m consecutive pattern words.

    Each whitespace-separated word of pat is stripped of the characters in
    string.punctuation, upper-cased and hashed with letsHash.  Every run of
    m consecutive hashes in hashedData is then compared with every run of
    m consecutive pattern hashes.  Each hit is recorded as (k, words): k is
    the 0-based offset of the run within hashedData and words is the
    matching m cleaned pattern words joined by single spaces.  When there is
    at least one hit, the list is passed to processMatches along with m
    (the original comment on that call says it prints the matches).

    Args:
        hashedData: sliceable sequence of hashed corpus words.
        pat: pattern text; the two-argument str.translate call used here
            is the Python 2 byte-string form.
        m: number of consecutive words that must agree; defaults to 20.

    Returns:
        None.
    """

    # Identity table: with it, translate() changes nothing and only deletes
    # the characters given as its second argument.  Built once, not per word.
    identity = string.maketrans("", "")

    # Clean and hash the words of the pattern.  A word made up only of
    # punctuation becomes "" and is still kept and hashed.
    pProcessed = []
    pHashed = []
    for word in pat.split():
        new = word.translate(identity, string.punctuation).upper()
        pProcessed.append(new)
        pHashed.append(letsHash(new))

    # The pattern windows do not depend on the corpus position, so slice
    # them once up front instead of once per corpus window.
    patternWindows = [tuple(pHashed[i : i + m]) for i in range(len(pHashed) - m + 1)]
    if not patternWindows:
        return  # pattern is shorter than m words, so nothing can match

    matches = []

    # m staggered iterators over the corpus, zipped together, yield each run
    # of m consecutive hashes as a tuple; k is the offset of its first word.
    for k, txtMtuple in enumerate(izip(*[iter(hashedData[i:]) for i in xrange(m)])):

        for i, window in enumerate(patternWindows):  # i = first word of pattern run

            # Tuple equality compares position by position and stops at the
            # first difference, like the hand-written loop it replaces.
            if window == txtMtuple:
                matches.append((k, " ".join(pProcessed[i : i + m])))

    if matches:
        processMatches(matches, m)  # hand the hits over for reporting
Ejemplo n.º 3
0
def full_search(hashedData, pat, m=20):
    '''Search prehashed corpus text for runs of m consecutive pattern words.

    Each whitespace-separated word of pat is stripped of the characters in
    string.punctuation, upper-cased and hashed with letsHash.  Every run of
    m consecutive hashes in hashedData is then compared with every run of
    m consecutive pattern hashes.  Each hit is recorded as (k, words): k is
    the 0-based offset of the run within hashedData and words is the
    matching m cleaned pattern words joined by single spaces.  When there is
    at least one hit, the list is passed to processMatches along with m
    (the original comment on that call says it prints the matches).

    Args:
        hashedData: sliceable sequence of hashed corpus words.
        pat: pattern text; the two-argument str.translate call used here
            is the Python 2 byte-string form.
        m: number of consecutive words that must agree; defaults to 20.

    Returns:
        None.
    '''

    # Identity table: with it, translate() changes nothing and only deletes
    # the characters given as its second argument.  Built once, not per word.
    identity = string.maketrans("", "")

    # Clean and hash the words of the pattern.  A word made up only of
    # punctuation becomes '' and is still kept and hashed.
    pProcessed = []
    pHashed = []
    for word in pat.split():
        new = word.translate(identity, string.punctuation).upper()
        pProcessed.append(new)
        pHashed.append(letsHash(new))

    # The pattern windows do not depend on the corpus position, so slice
    # them once up front instead of once per corpus window.
    patternWindows = [
        tuple(pHashed[i:i + m]) for i in range(len(pHashed) - m + 1)]
    if not patternWindows:
        return  # pattern is shorter than m words, so nothing can match

    matches = []

    # m staggered iterators over the corpus, zipped together, yield each run
    # of m consecutive hashes as a tuple; k is the offset of its first word.
    for k, txtMtuple in enumerate(
            izip(*[iter(hashedData[i:]) for i in xrange(m)])):

        # i = first word of the pattern run
        for i, window in enumerate(patternWindows):

            # Tuple equality compares position by position and stops at the
            # first difference, like the hand-written loop it replaces.
            if window == txtMtuple:
                matches.append((k, ' '.join(pProcessed[i:i + m])))

    if matches:
        processMatches(matches, m)  # hand the hits over for reporting