def process_phrasal(phrase, normalize=True):
    """
    Tokenize and stem the phrase words and compute the frequency of each
    word in the query list.

    Arguments:
        phrase      string representing the phrasal query
        normalize   if True, normalize (stem) each token and drop tokens for
                    which normalization returns None; otherwise keep the raw
                    tokens

    Returns:
        A tuple containing the tokenized phrase and a Counter object with the
        counts for the terms in the query.
    """
    query_list = []
    sentences = nltk.sent_tokenize(phrase)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        if normalize:
            for word in words:
                normalized = text_processing.normalize(word)
                if normalized is not None:
                    query_list.append(normalized)
        else:
            query_list.extend(words)
    # count the frequency of each term
    query_count = Counter(query_list)
    return (query_list, query_count)
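# Example usage (a sketch; assumes an nltk installation with the "punkt"
# tokenizer data and a text_processing module whose normalize() returns a
# stemmed token or None for tokens that should be dropped):
#
#     tokens, counts = process_phrasal("touch screen touch sensor")
#     # tokens -> normalized terms in order; counts -> Counter of term frequencies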
def process_query(query_str):
    """
    Tokenize and stem the query words and compute the log-weighted term
    frequency of each word in the query.

    Arguments:
        query_str   string of query words

    Returns:
        query_weight    a dictionary mapping each stemmed query term to its
                        log-frequency weight, 1 + log10(tf)
    """
    query_list = []
    sentences = nltk.sent_tokenize(query_str)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for word in words:
            normalized = text_processing.normalize(word)
            if normalized is not None:
                query_list.append(normalized)
    # count the frequency of each term
    query_count = Counter(query_list)
    # set the tf weight for each term
    query_weight = {}
    for query_term, term_count in query_count.items():
        query_weight[query_term] = 1 + math.log10(term_count)
    return query_weight
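# Minimal self-contained sketch (an illustration, not part of the original
# module) of the same 1 + log10(tf) weighting applied to an already-tokenized
# term list, without the nltk / text_processing dependencies:
import math
from collections import Counter

def _demo_query_weights(terms):
    """Return {term: 1 + log10(tf)} for an already-tokenized term list."""
    return {term: 1 + math.log10(count) for term, count in Counter(terms).items()}

# _demo_query_weights(["touch", "screen", "touch"])
# -> {"touch": 1.3010..., "screen": 1.0}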
def processText(text):
    """Tokenize and normalize the text, then drop stopwords."""
    words = text_processing.tokenize(text)
    words = text_processing.normalize(words)
    new_words = []
    for word in words:
        if word not in stopwords_:
            new_words.append(word)
    return new_words
def processDescription(desc):
    """
    Process products' description.
    """
    global idx
    idx += 1
    if idx % 10000 == 0:
        print(idx)
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text
def __tokenize_string(self, the_string):
    """
    Tokenize and stem a string and remove punctuation and stopwords.
    """
    word_list = []
    sentences = nltk.sent_tokenize(the_string)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for word in words:
            normalized = text_processing.normalize(word)
            if normalized is not None:
                word_list.append(normalized)
    return word_list
def __init__(self, text_path, mel_dir, lin_dir, data_in_memory=True):
    self.data = []
    self.data_in_memory = data_in_memory

    if data_in_memory and os.path.exists(os.path.join(mel_dir, "all.npy")):
        mels = np.load(os.path.join(mel_dir, "all.npy"), allow_pickle=True)
    else:
        mels = None

    with io.open(text_path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()

    for i, line in enumerate(lines):
        line = line.split("|", maxsplit=1)
        text = line[1]
        text = text_processing.normalize(text)
        text = text + Config.vocab_end_of_text

        # Skip if text is too long
        if len(text) > Config.max_N:
            print("Warning: Text with id '{}' is too long! Line will be skipped"
                  .format(line[0]))
            continue

        text = text_processing.vocab_lookup(text)
        text = torch.tensor(text, dtype=torch.long)

        if data_in_memory:
            if mels is not None:
                mel = mels[i]
            else:
                mel_path = os.path.join(mel_dir, line[0]) + ".npy"
                mel = np.load(mel_path)
            t = mel.shape[0]  # Needed for lin padding
            mel = self._process_mel(mel)

            # Skip if mel is too long
            if mel.shape[0] > Config.max_T:
                print("Warning: Mel with id '{}' is too long! Line will be skipped"
                      .format(line[0]))
                continue

            self.data.append({"name": line[0], "text": text, "mel": mel, "t": t})
        else:
            self.data.append({"name": line[0], "text": text})

    self.mel_dir = mel_dir
    self.lin_dir = lin_dir
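# Sketch of the expected text_path layout (an assumption inferred from the
# split("|", maxsplit=1) above): one utterance per line, "<id>|<raw text>",
# where <id> also names the per-utterance mel file "<id>.npy" in mel_dir, e.g.
#
#     utt_0001|Hello world.
#     utt_0002|A second example sentence.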
def indexing(training_path, postings_file, dictionary_file, patent_info_file):
    """
    Create an index of the corpus at training_path, placing the index in
    dictionary_file, postings_file and patent_info_file.

    dictionary_file will contain a list of the terms contained in the corpus.
    On every line, the following information will be included (separated by
    spaces):

        <term indexed> <document frequency> <postings pointer>

    The postings pointer is a line number in the postings file. Every line of
    the postings file is a postings list for the term that points to that
    line. A line contains several entries. Each entry has the following form:

        <patent ID> <log tf> <tf> <list of positions>

    The list of positions indicates the word positions at which the indexed
    word can be found in the given patent (a standard positional index). The
    list is always <tf> elements long.
    """
    pats = sorted(os.listdir(training_path))
    pi = open(patent_info_file, 'w')
    postings = dict()

    for pat in pats:
        tree = et.parse(os.path.join(training_path, pat))
        pat_id = os.path.splitext(pat)[0]
        root = tree.getroot()

        year = ""
        cites = "0"
        ipc = ""
        inventor = ""
        content = ""

        for child in root:
            # extract patent content
            if child.get('name') == 'Title':
                content += child.text.encode('utf-8') + " "
            if child.get('name') == 'Abstract':
                content += child.text.encode('utf-8') + " "
            # extract patent info (meta data)
            if child.get('name') == 'Publication Year':
                year = child.text.encode('utf-8').strip()
            if child.get('name') == 'Cited By Count':
                cites = child.text.encode('utf-8').strip()
            if child.get('name') == 'IPC Primary':
                ipc = child.text.encode('utf-8').strip()
            if child.get('name') == '1st Inventor':
                inventor = child.text.encode('utf-8').strip()

        pi.write(pat_id + " | " + year + " | " + cites + " | " + ipc + " | " + inventor + "\n")

        # remove non utf-8 characters, http://stackoverflow.com/a/20078869
        content = re.sub(r'[^\x00-\x7F]+', ' ', content)
        sentences = nltk.sent_tokenize(content)

        # build the positional index for this patent: term -> list of word positions
        i = 0
        occurrences = dict()
        for sentence in sentences:
            # tokenize sentences into words
            words = nltk.word_tokenize(sentence)
            for word in words:
                normalized = text_processing.normalize(word)
                if normalized is None:
                    continue
                if normalized in occurrences:
                    occurrences[normalized].append(i)
                else:
                    occurrences[normalized] = [i]
                i += 1

        # merge this patent's positions into the global postings
        for word, positions in occurrences.iteritems():
            if word in postings:
                postings[word].append((pat_id, positions))
            else:
                postings[word] = [(pat_id, positions)]

    pi.close()
    write_dict_postings(postings, postings_file, dictionary_file, training_path)
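# Illustrative sketch (not the original write_dict_postings, whose source is
# not shown here) of how one postings entry in the format documented above
# could be serialized from a (patent_id, positions) pair. The 1 + log10(tf)
# convention mirrors the query weighting elsewhere in this code; the exact
# convention used by write_dict_postings is an assumption.
import math

def _format_postings_entry(patent_id, positions):
    """Return '<patent ID> <log tf> <tf> <list of positions>' for one document."""
    tf = len(positions)
    log_tf = 1 + math.log10(tf)
    return "{} {:.4f} {} {}".format(patent_id, log_tf, tf,
                                    " ".join(str(p) for p in positions))

# _format_postings_entry("US7654321", [3, 17, 42])
# -> "US7654321 1.4771 3 3 17 42"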
def test_normalize(self):
    # (input, expected) pairs: normalize() should lower-case the text and
    # strip leading/trailing whitespace, leaving punctuation and inner
    # spacing untouched.
    cases = [
        ("This is an example.", "this is an example."),
        (" EXTRA SPACE ", "extra space"),
        ("THIS IS ALL CAPS!!", "this is all caps!!"),
        (" ", ""),
        ("this is all lower space...", "this is all lower space..."),
        (" H e L l O !", "h e l l o !"),
        ("", ""),
        ("........", "........"),
        ("EX A M P LE", "ex a m p le"),
        ("Test Text Normalization", "test text normalization"),
        ("AbCd EfGh IjKl MnOp", "abcd efgh ijkl mnop"),
    ]
    for test_str, expected in cases:
        with self.subTest(test_str=test_str):
            self.assertEqual(tp.normalize(test_str), expected)
def processDescription(desc):
    """Tokenize and normalize a product description string."""
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text
def fparse(kv):
    # kv is a (key, value) pair: decode the raw value bytes to text,
    # lower-case it, normalize, transliterate to ASCII with unidecode,
    # and compress the result, keeping the original key.
    return kv[0], compress(unidecode(normalize(bytes_to_text(kv[1]).lower())))
def process_raw_text(text):
    # Remove URLs, then everything matched by the pAt and extras patterns,
    # normalize the remaining text and trim surrounding whitespace.
    return normalize(extras.sub('', pAt.sub('', replaceURLs('', text)))).strip()