def full_search(hashedData, pat, m=20):
    """Report every run of ``m`` pattern words that also occurs in the corpus.

    Each whitespace-separated word of ``pat`` is stripped of the characters in
    ``string.punctuation``, upper-cased and hashed with ``letsHash``.  Every
    window of ``m`` consecutive pattern hashes is compared with every window
    of ``m`` consecutive entries of ``hashedData``.  If at least one window
    matches, the list of ``(corpus_index, matched_text)`` pairs is passed to
    ``processMatches`` together with ``m``.

    Args:
        hashedData: Sliceable sequence holding the pre-hashed corpus words.
        pat: Pattern text.
        m: Window length in words.

    Returns:
        None.
    """
    # Filtering characters works for byte and unicode strings alike, unlike
    # the two-argument str.translate() form, which only accepts Python 2
    # byte strings.
    punctuation = frozenset(string.punctuation)

    pProcessed = []
    pHashed = []
    for word in pat.split():
        cleaned = "".join(ch for ch in word if ch not in punctuation).upper()
        pProcessed.append(cleaned)
        pHashed.append(letsHash(cleaned))

    # Pattern windows do not depend on the corpus position, so build each
    # (hashes, text) pair once.  A window shorter than one word matches
    # nothing.
    patWindows = []
    if m >= 1:
        patWindows = [
            (tuple(pHashed[i:i + m]), " ".join(pProcessed[i:i + m]))
            for i in range(len(pHashed) - m + 1)
        ]
    if not patWindows:
        return

    matches = []
    # k is the corpus index of the first word of the current window.
    for k in range(len(hashedData) - m + 1):
        window = tuple(hashedData[k:k + m])
        for hashes, text in patWindows:
            if hashes == window:
                matches.append((k, text))

    if matches:
        processMatches(matches, m)  # print out matches
Beispiel #2
0
def full_search(hashedData, pat, m=20):
    """Report every run of ``m`` pattern words that also occurs in the corpus.

    Each whitespace-separated word of ``pat`` is stripped of the characters in
    ``string.punctuation``, upper-cased and hashed with ``letsHash``.  Every
    window of ``m`` consecutive pattern hashes is compared with every window
    of ``m`` consecutive entries of ``hashedData``.  If at least one window
    matches, the list of ``(corpus_index, matched_text)`` pairs is passed to
    ``processMatches`` together with ``m``.

    Args:
        hashedData: Sliceable sequence holding the pre-hashed corpus words.
        pat: Pattern text.
        m: Window length in words.

    Returns:
        None.
    """
    # Filtering characters works for byte and unicode strings alike, unlike
    # the two-argument str.translate() form, which only accepts Python 2
    # byte strings.
    punctuation = frozenset(string.punctuation)

    pProcessed = []
    pHashed = []
    for word in pat.split():
        cleaned = ''.join(ch for ch in word if ch not in punctuation).upper()
        pProcessed.append(cleaned)
        pHashed.append(letsHash(cleaned))

    # Pattern windows do not depend on the corpus position, so build each
    # (hashes, text) pair once.  A window shorter than one word matches
    # nothing.
    patWindows = []
    if m >= 1:
        patWindows = [
            (tuple(pHashed[i:i + m]), ' '.join(pProcessed[i:i + m]))
            for i in range(len(pHashed) - m + 1)
        ]
    if not patWindows:
        return

    matches = []
    # k is the corpus index of the first word of the current window.
    for k in range(len(hashedData) - m + 1):
        window = tuple(hashedData[k:k + m])
        for hashes, text in patWindows:
            if hashes == window:
                matches.append((k, text))

    if matches:
        processMatches(matches, m)  # print out matches
def hashPat(pat):
  """Normalise and hash every word of the pattern.

  Each whitespace-separated word of ``pat`` is stripped of the characters in
  ``string.punctuation`` and upper-cased, then hashed with
  ``letsHash(word, q=1009, d=26)``.

  Args:
    pat: Pattern text.

  Returns:
    A ``(pHashed, pProcessed)`` pair of equally long lists: the hash of each
    word and the normalised word itself, both in pattern order.
  """
  # Filtering characters works for byte and unicode strings alike, unlike the
  # two-argument str.translate() form, which only accepts Python 2 byte
  # strings.
  punctuation = frozenset(string.punctuation)

  pProcessed = [
    "".join(ch for ch in word if ch not in punctuation).upper()
    for word in pat.split()
  ]
  pHashed = [letsHash(word, q=1009, d=26) for word in pProcessed]
  return pHashed, pProcessed
def processData(hashedData, pat, m, rank, comm):
  """Search one part of the corpus for runs of ``m`` pattern words.

  Each whitespace-separated word of ``pat`` is stripped of the characters in
  ``string.punctuation``, upper-cased and hashed with
  ``letsHash(word, q=1009, d=26)``.  Every window of ``m`` consecutive pattern
  hashes is compared with every window of ``m`` consecutive entries of
  ``hashedData``.  If at least one window matches, the list of
  ``(index_in_hashedData, matched_text)`` pairs is passed to
  ``processMatches`` together with ``m``.

  Args:
    hashedData: Sliceable sequence holding the pre-hashed corpus words.
    pat: Pattern text.
    m: Window length in words.
    rank: Not used by this function; kept so the call signature is unchanged.
    comm: Not used by this function; kept so the call signature is unchanged.

  Returns:
    ``time.time()`` taken once the search (and any match processing) is done.
    The timestamp is returned whether or not matches were found; previously
    the function returned ``None`` when nothing matched.
  """
  # Filtering characters works for byte and unicode strings alike, unlike the
  # two-argument str.translate() form, which only accepts Python 2 byte
  # strings.
  punctuation = frozenset(string.punctuation)

  pProcessed = []
  pHashed = []
  for word in pat.split():
    cleaned = "".join(ch for ch in word if ch not in punctuation).upper()
    pProcessed.append(cleaned)
    pHashed.append(letsHash(cleaned, q=1009, d=26))

  # Pattern windows do not depend on the corpus position, so build each
  # (hashes, text) pair once.  A window shorter than one word matches nothing.
  patWindows = []
  if m >= 1:
    patWindows = [
      (tuple(pHashed[i:i + m]), ' '.join(pProcessed[i:i + m]))
      for i in range(len(pHashed) - m + 1)
    ]

  matches = []
  if patWindows:
    # k is the index in hashedData of the first word of the current window.
    for k in range(len(hashedData) - m + 1):
      window = tuple(hashedData[k:k + m])
      for hashes, text in patWindows:
        if hashes == window:
          matches.append((k, text))

  if matches:
    processMatches(matches, m) # print out matches

  return time.time()
Beispiel #5
0
def processData(hashedData, pat, m, rank, comm):
    """Search one part of the corpus for runs of ``m`` pattern words.

    Each whitespace-separated word of ``pat`` is stripped of the characters in
    ``string.punctuation``, upper-cased and hashed with
    ``letsHash(word, q=1009, d=26)``.  Every window of ``m`` consecutive
    pattern hashes is compared with every window of ``m`` consecutive entries
    of ``hashedData``.  If at least one window matches, the list of
    ``(index_in_hashedData, matched_text)`` pairs is passed to
    ``processMatches`` together with ``m``.

    Args:
        hashedData: Sliceable sequence holding the pre-hashed corpus words.
        pat: Pattern text.
        m: Window length in words.
        rank: Not used by this function; kept so the call signature is
            unchanged.
        comm: Not used by this function; kept so the call signature is
            unchanged.

    Returns:
        ``time.time()`` taken once the search (and any match processing) is
        done.  The timestamp is returned whether or not matches were found;
        previously the function returned ``None`` when nothing matched.
    """
    # Filtering characters works for byte and unicode strings alike, unlike
    # the two-argument str.translate() form, which only accepts Python 2
    # byte strings.
    punctuation = frozenset(string.punctuation)

    pProcessed = []
    pHashed = []
    for word in pat.split():
        cleaned = ''.join(ch for ch in word if ch not in punctuation).upper()
        pProcessed.append(cleaned)
        pHashed.append(letsHash(cleaned, q=1009, d=26))

    # Pattern windows do not depend on the corpus position, so build each
    # (hashes, text) pair once.  A window shorter than one word matches
    # nothing.
    patWindows = []
    if m >= 1:
        patWindows = [
            (tuple(pHashed[i:i + m]), ' '.join(pProcessed[i:i + m]))
            for i in range(len(pHashed) - m + 1)
        ]

    matches = []
    if patWindows:
        # k is the index in hashedData of the first word of the window.
        for k in range(len(hashedData) - m + 1):
            window = tuple(hashedData[k:k + m])
            for hashes, text in patWindows:
                if hashes == window:
                    matches.append((k, text))

    if matches:
        processMatches(matches, m)  # print out matches

    return time.time()