def process_phrasal(phrase, normalize=True):
    """
    Tokenize and stem the phrase words and compute the frequency of each word in the query list

    Arguments:
        phrase           string representing the phrasal query

    Returns: 
        A tuple containing the tokenized sentence and a counter object containing the counts
        for the terms in the query.
    """
    
    query_list = []
    sentences = nltk.sent_tokenize(phrase)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        if normalize:
            for word in words:
                normalized = text_processing.normalize(word)
                if normalized is not None:
                    query_list.append(normalized)
        else:
            query_list.extend(words)

    # count the frequency of each term
    query_count = Counter(query_list)
    return (query_list, query_count)
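
A usage sketch for the snippet above, together with the imports it relies on. The exact output depends on how text_processing.normalize stems and filters tokens, so the values in the comments are only illustrative.

import nltk
from collections import Counter

import text_processing  # project-specific module assumed by the snippet above

nltk.download("punkt", quiet=True)  # tokenizer models used by sent_tokenize/word_tokenize

terms, counts = process_phrasal("information retrieval")
print(terms)   # e.g. ['inform', 'retriev'] if normalize() stems and lowercases
print(counts)  # e.g. Counter({'inform': 1, 'retriev': 1})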
Example #2
def process_query(query_str):
    """
    Tokenize and stem the query words and compute the frequency of each word in the query list

    Arguments:
        query_str       string of query words

    Returns: 
        query_count     a dictionary with the stemmed words and the its frequency in the query
    """
    
    query_list = []
    sentences = nltk.sent_tokenize(query_str)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for word in words:            
            normalized = text_processing.normalize(word)
            if normalized is not None:
                query_list.append(normalized)
            
    # count the frequency of each term
    query_count = Counter(query_list)

    # set the tf value for each term
    query_weight = {}
    for query_term, term_count in query_count.items():
        query_weight[query_term] = 1 + math.log10(term_count)

    return query_weight
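
A worked illustration of the 1 + log10(tf) weighting above: a term that occurs twice in the query gets weight 1 + log10(2) ≈ 1.301, while a term that occurs once gets 1.0. The terms below are placeholders, not real output of text_processing.normalize.

import math
from collections import Counter

# Suppose normalization reduced the query "cats cat dog" to these terms.
query_list = ["cat", "cat", "dog"]
query_count = Counter(query_list)  # Counter({'cat': 2, 'dog': 1})

query_weight = {term: 1 + math.log10(tf) for term, tf in query_count.items()}
print(query_weight)  # {'cat': 1.301..., 'dog': 1.0}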
Example #4
def processText(text):
    words = text_processing.tokenize(text)
    words = text_processing.normalize(words)
    new_words = []
    for word in words:
        if word not in stopwords_:
            new_words.append(word)
    return new_words


def processDescription(desc):
    """
    Process products' description
    """
    global idx
    idx += 1
    if idx % 10000 == 0:
        print(idx)
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text
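
The stopwords_ set filtered against in processText above is defined elsewhere in that module. A minimal sketch of how such a set could be built, here assuming NLTK's English stopword list (the original project may use a different list):

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)
stopwords_ = set(stopwords.words("english"))  # hypothetical stand-in for the module's set

print("the" in stopwords_)     # True
print("patent" in stopwords_)  # False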
Example #6
    def __tokenize_string(self, the_string):
        """
            Tokenize and stem a string and remove punctuation and stopwords
        """
        word_list = []
        sentences = nltk.sent_tokenize(the_string)
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            for word in words:
                normalized = text_processing.normalize(word)
                if normalized is not None:
                    word_list.append(normalized)

        return word_list
    def __init__(self, text_path, mel_dir, lin_dir, data_in_memory=True):
        self.data = []
        self.data_in_memory = data_in_memory
        if data_in_memory and os.path.exists(os.path.join(mel_dir, "all.npy")):
            mels = np.load(os.path.join(mel_dir, "all.npy"), allow_pickle=True)
        else:
            mels = None
        with io.open(text_path, "r", encoding="utf-8-sig") as f:
            lines = f.readlines()
            for i, line in enumerate(lines):
                line = line.split("|", maxsplit=1)
                text = line[1]
                text = text_processing.normalize(text)
                text = text + Config.vocab_end_of_text
                # Skip if text is too long
                if len(text) > Config.max_N:
                    print(
                        "Warning: Text with id '{}' is too long! Line will be skipped"
                        .format(line[0]))
                    continue
                text = text_processing.vocab_lookup(text)
                text = torch.tensor(text, dtype=torch.long)
                if data_in_memory:
                    if mels is not None:
                        mel = mels[i]
                        t = mel.shape[0]  # Needed for lin padding
                        mel = self._process_mel(mel)
                    else:
                        mel_path = os.path.join(mel_dir, line[0]) + ".npy"
                        mel = np.load(mel_path)
                        t = mel.shape[0]  # Needed for lin padding
                        mel = self._process_mel(mel)
                    # Skip if mel is too long
                    if mel.shape[0] > Config.max_T:
                        print(
                            "Warning: Mel with id '{}' is too long! Line will be skipped"
                            .format(line[0]))
                        continue
                    self.data.append({
                        "name": line[0],
                        "text": text,
                        "mel": mel,
                        "t": t
                    })
                else:
                    self.data.append({"name": line[0], "text": text})
        self.mel_dir = mel_dir
        self.lin_dir = lin_dir
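
The loader above optionally reads a single all.npy bundle instead of one mel file per utterance. A sketch of how such a bundle could be produced follows; the transcript path, directory layout, and file names are assumptions, and the packing order must match the transcript line order because the loader indexes mels by line number.

import io
import os

import numpy as np

text_path = "transcript.txt"  # assumed transcript: one "<id>|<text>" line per utterance
mel_dir = "mels"              # assumed directory of per-utterance <id>.npy mel files

with io.open(text_path, "r", encoding="utf-8-sig") as f:
    ids = [line.split("|", maxsplit=1)[0] for line in f]

mels = np.empty(len(ids), dtype=object)  # object array: mels differ in length
for i, utt_id in enumerate(ids):
    mels[i] = np.load(os.path.join(mel_dir, utt_id + ".npy"))
np.save(os.path.join(mel_dir, "all.npy"), mels)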
Example #9
def indexing(training_path, postings_file, dictionary_file, patent_info_file):
    """
    Create an index of the corpus at training_path, placing the index in
    dictionary_file, postings_file and patent_info_file.
    
    dictionary_file will contain a list of the terms contained in the corpus.
    On every line, the following information will be included (separated by spaces):
        <term indexed> <document frequency> <postings pointer>
    The postings pointer is a line number in the postings file.
    Every line of the postings file is a postings list for the term that points to that
    line. A line contains several entries. Each entry has the following form:
        <patent ID> <log tf> <tf> <list of positions>
    The list of positions indicates the word positions at which the indexed word can
    be found in the given patent (a standard positional index). The list is always
    <tf> elements long.
    """
    pats = sorted(os.listdir(training_path))

    pi = open(patent_info_file, 'w')
    postings = dict()
    for pat in pats:
        tree = et.parse(os.path.join(training_path, pat))

        pat_id = os.path.splitext(pat)[0]

        root = tree.getroot()

        year = ""
        cites = "0"
        ipc = ""
        inventor = ""
        content = ""
        for child in root:
            # extract patent content
            if child.get('name') == 'Title':
                content += child.text + " "

            if child.get('name') == 'Abstract':
                content += child.text + " "

            # extract patent info (metadata)
            if child.get('name') == 'Publication Year':
                year = child.text.strip()

            if child.get('name') == 'Cited By Count':
                cites = child.text.strip()

            if child.get('name') == 'IPC Primary':
                ipc = child.text.strip()

            if child.get('name') == '1st Inventor':
                inventor = child.text.strip()

        pi.write(pat_id + " | " + year + " | " + cites + " | " + ipc + " | " +
                 inventor + "\n")

        # strip non-ASCII characters, http://stackoverflow.com/a/20078869
        content = re.sub(r'[^\x00-\x7F]+', ' ', content)

        sentences = nltk.sent_tokenize(content)
        i = 0
        occurrences = dict()
        for sentence in sentences:
            # tokenize sentences into words
            words = nltk.word_tokenize(sentence)
            for word in words:
                normalized = text_processing.normalize(word)
                if normalized is None:
                    continue

                if normalized in occurrences:
                    occurrences[normalized].append(i)
                else:
                    occurrences[normalized] = [i]

                i += 1

        for word, positions in occurrences.items():
            if word in postings:
                postings[word].append((pat_id, positions))
            else:
                postings[word] = [(pat_id, positions)]

    pi.close()
    write_dict_postings(postings, postings_file, dictionary_file,
                        training_path)
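
Given the file layout described in the docstring above, the two index files could be read back roughly as follows. This is only a sketch: the authoritative format is whatever write_dict_postings (not shown here) actually writes, and the parser assumes each postings entry is laid out as space-separated fields exactly as documented.

def load_dictionary(dictionary_file):
    """Read '<term> <document frequency> <postings pointer>' lines into a dict."""
    dictionary = {}
    with open(dictionary_file) as f:
        for line in f:
            term, df, pointer = line.split()
            dictionary[term] = (int(df), int(pointer))
    return dictionary


def parse_postings_line(line):
    """Parse one postings line into a list of (patent_id, log_tf, tf, positions)."""
    tokens = line.split()
    entries = []
    i = 0
    while i < len(tokens):
        patent_id, log_tf, tf = tokens[i], float(tokens[i + 1]), int(tokens[i + 2])
        positions = [int(p) for p in tokens[i + 3:i + 3 + tf]]
        entries.append((patent_id, log_tf, tf, positions))
        i += 3 + tf
    return entries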
Example #10
    def test_normalize(self):
        test_str = "This is an example."
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "this is an example.")

        test_str = "   EXTRA   SPACE   "
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "extra space")

        test_str = "THIS IS ALL CAPS!!"
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "this is all caps!!")

        test_str = "                   "
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "")

        test_str = "this is all lower space..."
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "this is all lower space...")

        test_str = "  H  e  L    l   O   !"
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "h e l l o !")

        test_str = ""
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "")

        test_str = "........"
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "........")

        test_str = "EX  A M P     LE"
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "ex a m p le")

        test_str = "Test Text Normalization"
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "test text normalization")

        test_str = "AbCd EfGh IjKl MnOp"
        pred = tp.normalize(test_str)
        self.assertEqual(pred, "abcd efgh ijkl mnop")
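
The assertions above pin tp.normalize down as lowercasing the string and collapsing runs of whitespace. A minimal implementation consistent with these test cases is sketched below; the project's real normalize() may do more (other examples in this listing suggest per-token stemming and stopword filtering).

def normalize(text):
    # Lowercase and collapse all runs of whitespace to single spaces.
    return " ".join(text.lower().split())

assert normalize("   EXTRA   SPACE   ") == "extra space"
assert normalize("  H  e  L    l   O   !") == "h e l l o !"
assert normalize("........") == "........"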
Example #11
def processDescription(desc):
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text
Example #12
def fparse(kv):
    # kv is a (key, value) pair: decode the value's bytes to text, lowercase and
    # normalize it, transliterate to ASCII with unidecode, then compress the result
    return kv[0], compress(unidecode(normalize(bytes_to_text(kv[1]).lower())))
Example #13
def process_raw_text(text):
    # Remove URLs (via replaceURLs) and anything matched by the pAt and extras
    # patterns, then normalize the remaining text and trim surrounding whitespace
    return normalize(extras.sub('', pAt.sub('', replaceURLs('', text)))).strip()