def load_corpus(self, corpus_dir):
    """Load corpus from a given directory, then initialize the documents and model.

    Line format:
        token1 \t token2 \t token3 \t ...
        ...
    """
    self.documents = []
    rand = random.Random()
    logging.info('Load corpus from %s.' % corpus_dir)

    for root, dirs, files in os.walk(corpus_dir):
        for f in files:
            filename = os.path.join(root, f)
            logging.info('Load filename %s.' % filename)
            fp = open(filename, 'r')
            for doc_str in fp.readlines():
                doc_str = doc_str.decode('gbk')
                doc_tokens = doc_str.strip().split('\t')
                if len(doc_tokens) < 2:
                    continue
                document = Document(self.model.num_topics)
                document.parse_from_tokens(doc_tokens, rand, self.vocabulary)
                if document.num_words() < 2:
                    continue
                self.documents.append(document)
            fp.close()

    logging.info('The document number is %d.' % len(self.documents))

    self._initialize_model()
    self._compute_smoothing_only_bucket()
    self._initialize_topic_word_coefficient()
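# A minimal sketch of the corpus file layout load_corpus expects: each file in
# the corpus directory holds one document per line, with tokens separated by
# tabs. The path and tokens below are hypothetical examples; note that the
# loader above decodes each line as GBK, so ASCII tokens are safe as written.
def write_sample_corpus_file(path='corpus/sample.txt'):
    docs = [
        ['token1', 'token2', 'token3'],
        ['tokenA', 'tokenB'],
    ]
    with open(path, 'w') as fp:
        for tokens in docs:
            fp.write('\t'.join(tokens) + '\n')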
def _inference_one_chain(self, doc_tokens, rand):
    """Infer topics with one Markov chain.

    Returns the sparse topic distribution p(z|d).
    """
    document = Document(self.model.num_topics)
    document.parse_from_tokens(doc_tokens, rand, self.vocabulary, self.model)
    if document.num_words() == 0:
        return dict()

    accumulated_topic_hist = {}
    for i in xrange(self.total_iterations):
        # One sampling iteration over all words in the document.
        for word in document.get_words():
            # Remove the word's current topic assignment.
            document.decrease_topic(word.topic, 1)
            new_topic = self._sample_word_topic(document, word.id, rand)
            assert new_topic is not None
            word.topic = new_topic
            # Add the newly sampled topic assignment.
            document.increase_topic(new_topic, 1)

        # Accumulate the document-topic histogram after the burn-in period.
        if i >= self.burn_in_iterations:
            for non_zero in document.doc_topic_hist.get_non_zeros():
                if non_zero.topic in accumulated_topic_hist:
                    accumulated_topic_hist[non_zero.topic] += non_zero.count
                else:
                    accumulated_topic_hist[non_zero.topic] = non_zero.count

    topic_dist = self._l1normalize_distribution(accumulated_topic_hist)
    return topic_dist
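# A minimal sketch of what _l1normalize_distribution is assumed to do: turn the
# accumulated {topic: count} histogram into a sparse p(z|d) whose values sum to
# one. This is an illustration based on how the result is used above, not the
# original implementation.
def l1_normalize_distribution(topic_hist):
    """Normalize a {topic: count} dict so its values sum to 1.0."""
    total = float(sum(topic_hist.values()))
    if total == 0.0:
        return {}
    return {topic: count / total for topic, count in topic_hist.items()}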
def test_compute_loglikelihood(self):
    doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                  'mac os x', 'chrome',  # only exist in vocabulary
                  'nokia', 'null']  # in neither vocabulary nor model
    document = Document(self.model.num_topics)
    rand = random.Random()
    rand.seed(0)
    document.parse_from_tokens(
        doc_tokens, rand, self.vocabulary, self.model)
    documents = [document, document]
    self.assertEqual(-14.113955684239654,
                     self.model_evaluator.compute_loglikelihood(documents))
def get_document(self):
    self.__doc = Document(text=self.__text)
    self.__doc.set_emails(self.__emails)
    self.__doc.set_links(self.__links)
    self.__doc.set_hash_tags(self.__hash_tags)
    self.__doc.set_tokens(self.__tokens)
    self.__doc.set_sentences(list(self.__blob.sentences))
    return self.__doc
def _load_corpus(self, corpus_dir):
    self.documents = []
    if not os.path.exists(corpus_dir):
        logging.error('The corpus directory %s does not exist.' % corpus_dir)
        return False

    for root, dirs, files in os.walk(corpus_dir):
        for f in files:
            filename = os.path.join(root, f)
            fp = open(filename, 'rb')
            record_reader = RecordReader(fp)
            while True:
                blob = record_reader.read()
                if blob is None:
                    break
                document = Document(self.model.num_topics)
                document.parse_from_string(blob)
                self.documents.append(document)
            fp.close()
    return True
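# _load_corpus only relies on RecordReader(fp).read() returning the next record
# as a byte string, or None at end of file. Below is a minimal sketch of such a
# reader, assuming a simple length-prefixed binary layout; the real RecordReader
# and its on-disk format are not shown in this snippet and may differ.
import struct

class SimpleRecordReader(object):
    """Read records stored as a 4-byte big-endian length followed by the payload."""

    def __init__(self, fp):
        self.fp = fp

    def read(self):
        header = self.fp.read(4)
        if len(header) < 4:
            return None  # end of file
        (length,) = struct.unpack('>I', header)
        blob = self.fp.read(length)
        if len(blob) < length:
            return None  # truncated record
        return blob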
def load_corpus(instance_folder_path: str, non_instance_folder_path: str) -> Corpus:
    files = [(file, DocumentClass.INSTANCE)
             for file in listdir(instance_folder_path)
             if isfile(join(instance_folder_path, file))] \
        + [(file, DocumentClass.NON_INSTANCE)
           for file in listdir(non_instance_folder_path)
           if isfile(join(non_instance_folder_path, file))]

    documents = []
    for file, is_instance in files:
        folder_path = instance_folder_path if is_instance == DocumentClass.INSTANCE \
            else non_instance_folder_path
        file_path = join(folder_path, file)
        with open(file_path, 'r') as document:
            doc = Document(document.read(), is_instance)
            documents.append(doc)
    return Corpus(documents)
def load_documents(folder_path: str) -> [Document]:
    files = [
        file for file in listdir(folder_path)
        if isfile(join(folder_path, file))
    ]
    documents = []
    for file in files:
        with open(join(folder_path, file), 'r') as document:
            doc = Document(document.read(), DocumentClass.UNKNOWN)
            documents.append(doc)
    return documents
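# A hedged usage sketch for the two loaders above. The folder paths are
# hypothetical examples; Corpus, Document and DocumentClass are assumed to be
# provided by the surrounding project.
def build_training_and_unlabeled_sets():
    corpus = load_corpus('data/instances', 'data/non_instances')
    unlabeled_docs = load_documents('data/unlabeled')
    return corpus, unlabeled_docs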
def read_corenlp_doc(filename, verbose=True):
    if verbose:
        log.info('Reading CoreNLP document from {}'.format(filename))

    input_xml = smart_file_handler(filename)

    xml_parser = etree.XMLParser(target=CoreNLPTarget())
    sents, corefs = etree.parse(input_xml, xml_parser)
    doc_name = splitext(basename(filename))[0]
    doc = Document.construct(doc_name, sents, corefs)

    input_xml.close()
    return doc
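# A hedged usage sketch for read_corenlp_doc: the path below is a hypothetical
# example of a CoreNLP XML output file to be parsed into a Document.
def load_example_corenlp_doc():
    return read_corenlp_doc('corenlp_output/example.xml', verbose=True)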
def read_doc_from_ontonotes(coref_doc, name_doc, verbose=True):
    doc_id = coref_doc.document_id.split('@')[0]
    assert doc_id == name_doc.document_id.split('@')[0], \
        '{} and {} do not have the same document_id'.format(coref_doc, name_doc)

    if verbose:
        log.info('Reading ontonotes document {}'.format(doc_id))

    conll_file_path = join(ontonotes_annotations_source, doc_id + '.depparse')
    all_sents = read_conll_depparse(conll_file_path)

    all_corefs = read_coref_doc(coref_doc)

    doc_name = doc_id.split('/')[-1]
    doc = Document.construct(doc_name, all_sents, all_corefs)

    for name_entity in read_name_doc(name_doc):
        add_name_entity_to_doc(doc, name_entity)

    return doc
class GraphematicalAnalysis:
    def __init__(self, text=None):
        self.__text = text
        self.__blob = None
        self.__tokens = []
        self.__DEL = [' ', ' ', ' ', '\t', '\n']
        self.__SIG = [
            '.', ',', '-', '—', '!', '?', ';', ':',
            '(', ')', '[', ']', '{', '}'
        ]
        self.__SYM = ['«', '»', '"', '“', '”', '``', '\'\'']
        self.__RLE = [
            'й', 'ц', 'у', 'к', 'е', 'н', 'г', 'ш', 'щ', 'з', 'х', 'ъ',
            'ф', 'ы', 'в', 'а', 'п', 'р', 'о', 'л', 'д', 'ж', 'э',
            'я', 'ч', 'с', 'м', 'и', 'т', 'ь', 'б', 'ю', 'ё',
            'Й', 'Ц', 'У', 'К', 'Е', 'Н', 'Г', 'Ш', 'Щ', 'З', 'Х', 'Ъ',
            'Ф', 'Ы', 'В', 'А', 'П', 'Р', 'О', 'Л', 'Д', 'Ж', 'Э',
            'Я', 'Ч', 'С', 'М', 'И', 'Т', 'Ь', 'Б', 'Ю', 'Ё'
        ]
        self.__LLE = [
            'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'a', 's', 'd',
            'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm',
            'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', 'A', 'S', 'D',
            'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M'
        ]
        self.__DC = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
        self.__END = ['.', '!', '?']
        self.__emails = []
        self.__hash_tags = []
        self.__links = []
        self.__tokenization_result = []
        self.__doc = None
        self.__regexp_dir = os.getcwd() + '/common/'

    def __pre_processing(self):
        # Clear all previous results
        self.__tokens = []
        self.__tokenization_result = []

        # Extract all emails & replace them with '__EMAIL'
        self.__emails = self.extract_email_addresses(self.__text)
        for email in self.__emails:
            self.__text = self.__text.replace(email, '__EMAIL')

        # Extract all hashtags & replace them with '__HASHTAG'
        self.__hash_tags = self.extract_hash_tags(self.__text)
        for tag in self.__hash_tags:
            self.__text = self.__text.replace(tag, '__HASHTAG')

        # Extract all links & replace them with '__LINK'
        self.__links = self.extract_links(self.__text)
        for link in self.__links:
            self.__text = self.__text.replace(link, '__LINK')

        self.__blob = TextBlob(self.__text)

        # Replace quotes like '``' and "''" with '"'
        for token in self.__blob.tokens:
            new_token = str(token)
            if new_token.startswith('``'):
                new_token = "\""
            elif new_token.endswith('\'\''):
                new_token = "\""
            self.__tokenization_result.append(new_token)

        # Removing unicode special character in first token
        # self.__tokenization_result[0] = self.__tokenization_result[0][1:]

        quotes = OrderedDict()

        # Search for quotes, delete them & remember their positions
        for i in range(0, len(self.__tokenization_result)):
            if self.__tokenization_result[i].startswith("«") or \
                    self.__tokenization_result[i].startswith("\""):
                if len(self.__tokenization_result[i]) > 2:
                    # Remember first symbol
                    quotes[i + len(quotes)] = self.__tokenization_result[i][0]
                    # Delete first symbol
                    self.__tokenization_result[i] = \
                        self.__tokenization_result[i][1:]
            if self.__tokenization_result[i].endswith("»") or \
                    self.__tokenization_result[i].endswith("\""):
                if len(self.__tokenization_result[i]) > 2:
                    # Remember last symbol
                    quotes[i + len(quotes) + 1] = \
                        self.__tokenization_result[i][-1]
                    # Delete last symbol
                    self.__tokenization_result[i] = \
                        self.__tokenization_result[i][0:-1]

        # Insert the remembered quotes back as individual tokens
        for key in quotes.keys():
            self.__tokenization_result.insert(key, quotes[key])

    def analysis(self):
        self.__pre_processing()

        current_email = 0
        current_hash_tag = 0
        current_link = 0

        # Add descriptors & labels for each token in the text
        for raw_token in self.__tokenization_result:
            # Restore the original email / hashtag / link text for placeholders
            if raw_token == '__EMAIL':
                raw_token = self.__emails[current_email]
                current_email += 1
            elif raw_token == '__HASHTAG':
                raw_token = self.__hash_tags[current_hash_tag]
                current_hash_tag += 1
            elif raw_token == '__LINK':
                raw_token = self.__links[current_link]
                current_link += 1

            if self.index_of_any(raw_token, self.__DEL):
                # Delimiter
                token = Token(text=raw_token, grapheme=Grapheme.DEL)
                # labels
                token.add_label(Label.SPACE)
                self.__tokens.append(token)
            elif self.index_of_any(raw_token, self.__RLE):
                # Russian lexeme
                token = Token(text=raw_token, grapheme=Grapheme.RLE)
                # labels
                token.add_label(Label.WORD)
                token.add_label(Label.CYRIL)
                self.__tokens.append(token)
            elif self.index_of_any(raw_token, self.__SYM):
                # Symbol
                token = Token(text=raw_token, grapheme=Grapheme.SYM)
                # labels
                token.add_label(Label.QUOTE)
                token.add_label(Label.MARKUP)
                if raw_token == "«" or raw_token == "“":
                    token.add_label(Label.OPENING)
                elif raw_token == "»" or raw_token == "”":
                    token.add_label(Label.CLOSING)
                self.__tokens.append(token)
            elif self.index_of_any(raw_token, self.__LLE):
                # Latin lexeme
                token = Token(text=raw_token, grapheme=Grapheme.LLE)
                # labels
                token.add_label(Label.WORD)
                token.add_label(Label.LATIN)
                self.__tokens.append(token)
            elif self.index_of_any(raw_token, self.__DC):
                # Digits complex
                token = Token(text=raw_token, grapheme=Grapheme.DC)
                # labels
                token.add_label(Label.NUMBER)
                self.__tokens.append(token)
            else:
                if self.index_of_any(raw_token, self.__SIG):
                    # Signum
                    token = Token(text=raw_token, grapheme=Grapheme.SIG)
                    # labels
                    token.add_label(Label.PUNCT)
                    if raw_token == "(" or raw_token == "[" or raw_token == '{':
                        token.add_label(Label.OPENING)
                    elif raw_token == ")" or raw_token == "]" or raw_token == '}':
                        token.add_label(Label.CLOSING)
                    self.__tokens.append(token)
                else:
                    # Composite token TODO: add #hashtag, email, phone labels
                    token = Token(text=raw_token, grapheme=Grapheme.COMPOSITE)
                    # labels
                    if raw_token in self.__emails:
                        token.add_label(Label.EMAIL)
                    elif raw_token in self.__hash_tags:
                        token.add_label(Label.HASHTAG)
                    elif raw_token in self.__links:
                        token.add_label(Label.LINK)
                    elif self.is_word_with_a_hyphen(raw_token):
                        token.add_label(Label.WORD)
                        token.add_label(Label.CYRIL)
                    else:
                        token.add_label(Label.OTHER)
                    self.__tokens.append(token)

            space_token = Token(text=" ", grapheme=Grapheme.DEL)
            space_token.add_label(Label.SPACE)
            # self.__tokens.append(space_token)

        return self.__tokens

    @staticmethod
    def index_of_any(source, dictionary):
        for i in range(0, len(source)):
            if source[i] not in dictionary:
                return False
        return True

    @staticmethod
    def intersects(source, dictionary):
        for i in range(0, len(source)):
            if source[i] in dictionary:
                return True
        return False

    @staticmethod
    def extract_email_addresses(string):
        r = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-z]{2,5})")
        return r.findall(string)

    @staticmethod
    def is_word_with_a_hyphen(string):
        r = re.findall(r'[\w-]+[\w-]', string)
        return len(r) > 0

    def extract_links(self, string):
        with open(self.__regexp_dir + 'link_regexp.txt', 'r') as f:
            pattern = f.read()
        r = re.compile(pattern)
        return r.findall(string)

    @staticmethod
    def extract_hash_tags(string):
        r = re.compile(r'#\w*')
        return r.findall(string)

    def set_text(self, text):
        self.__text = text

    def get_tokens(self):
        # self.__tokens.pop()
        return self.__tokens

    def get_emails(self):
        return self.__emails

    def get_links(self):
        return self.__links

    def get_hash_tags(self):
        return self.__hash_tags

    def get_document(self):
        self.__doc = Document(text=self.__text)
        self.__doc.set_emails(self.__emails)
        self.__doc.set_links(self.__links)
        self.__doc.set_hash_tags(self.__hash_tags)
        self.__doc.set_tokens(self.__tokens)
        self.__doc.set_sentences(list(self.__blob.sentences))
        return self.__doc
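# A hedged usage sketch for GraphematicalAnalysis, using only methods defined
# above. The sample text is an arbitrary example; note that extract_links reads
# 'common/link_regexp.txt' under the current working directory, so that file
# must exist for analysis() to run.
def analyze_sample_text():
    analyzer = GraphematicalAnalysis()
    analyzer.set_text('Hello, world! Write to test@example.com #demo')
    tokens = analyzer.analysis()
    document = analyzer.get_document()
    return tokens, document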