# Import assumed from usage below (konlpy's Twitter part-of-speech tagger).
from konlpy import tag


def parse_file(pid, input_queue, output_queue):
    twitter = tag.Twitter()

    def tagger(sent):
        return [word for word, word_tag in twitter.pos(sent)
                if word_tag not in ['Punctuation', 'Unknown']]

    while True:
        get = input_queue.get()
        if get is None:
            # Poison pill: report this worker's pid and stop.
            output_queue.put(pid)
            break
        with open(get) as f:
            lines = f.readlines()
        sent = []
        for line in lines:
            # Skip blank lines and opening <doc ...> markers.
            if line == '\n' or line.find('<doc') != -1:
                continue
            # A closing doc> marker ends the current document.
            if line.find('doc>') != -1:
                output_queue.put(' '.join(sent))
                sent = []
                continue
            line = line.strip()
            try:
                sent.append(' '.join(tagger(line)))
            except Exception as e:
                print('Exception occurred: "{}"'.format(line))
                print(e)
                output_queue.put(pid)
                break
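# A minimal driver sketch for parse_file above, assuming WikiExtractor-style
# dump files (the <doc ...> markers suggest that). The glob pattern, worker
# count, and output file name are assumptions, not part of the original code.
# Note: parse_file also echoes its pid when a line fails to tag, so this
# simple pid-counting loop assumes workers finish without errors.
from multiprocessing import Process, Queue
import glob

def run_parse(num_workers=4):
    input_queue, output_queue = Queue(), Queue()
    workers = [Process(target=parse_file, args=(pid, input_queue, output_queue))
               for pid in range(num_workers)]
    for w in workers:
        w.start()
    for path in glob.glob('extracted/*/wiki_*'):
        input_queue.put(path)
    for _ in workers:
        input_queue.put(None)            # one poison pill per worker
    finished = 0
    with open('tokenized_corpus.txt', 'w') as out:
        while finished < num_workers:
            item = output_queue.get()
            if isinstance(item, int):    # a worker reports its pid when done
                finished += 1
            else:
                out.write(item + '\n')
    for w in workers:
        w.join()

# Usage (guard required for multiprocessing on Windows):
# if __name__ == '__main__':
#     run_parse()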
def __init__(self, new=False):
    self.dictionary = ""
    self.model = ""
    self.lock = threading.Lock()
    self.twitter = tag.Twitter()
    self.tokenize_cache = dict()
    if not new:
        self.load_from_file(LDA_PICKLE_PATH)
    self.topic_cache = dict()
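# The lock and tokenize_cache initialized above suggest thread-safe memoized
# tokenization. The standalone class below is a sketch of that pattern under
# that assumption; it is not code from the project (only konlpy is assumed).
import threading
from konlpy import tag

class CachedTokenizer:
    def __init__(self):
        self.lock = threading.Lock()
        self.twitter = tag.Twitter()
        self.cache = dict()

    def tokenize(self, text):
        # Return the cached token list if this text was seen before.
        with self.lock:
            if text in self.cache:
                return self.cache[text]
        tokens = [word for word, _ in self.twitter.pos(text, norm=True, stem=True)]
        with self.lock:
            self.cache[text] = tokens
        return tokens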
def __init__(self, tagger, useful_tags, delimiters, stopwords, **kwargs):
    if tagger == 'twitter':
        self.tagger = taggers.Twitter()
        self.tagger_options = {
            'norm': bool(kwargs.get('norm', True)),
            'stem': bool(kwargs.get('stem', True)),
        }
    else:
        raise kwchatbotError("available tagger is: twitter")
    self.useful_tags = useful_tags
    self.delimiters = delimiters
    self.stopwords = stopwords
    self.splitter = self.splitterer()
    self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)
def __init__(self, tagger, useful_tags, delimiters, min_token_length,
             stopwords, **kwargs):
    if tagger == 'twitter':
        self.tagger = taggers.Twitter()
        self.tagger_options = {
            'norm': bool(kwargs.get('norm', True)),
            'stem': bool(kwargs.get('stem', True)),
        }
    elif tagger == 'komoran':
        self.tagger = taggers.Komoran()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'hannanum':
        self.tagger = taggers.Hannanum()
        self.tagger_options = {
            'ntags': int(kwargs.get('ntags', 9)),
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'kkma':
        self.tagger = taggers.Kkma()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    # Mecab is not supported on Windows, so it is omitted here.
    # elif tagger == 'mecab':
    #     self.tagger = taggers.Mecab()
    #     self.tagger_options = {
    #         'flatten': bool(kwargs.get('flatten', True)),
    #     }
    else:
        raise LexRankError(
            "available taggers are: twitter, komoran, hannanum, kkma")
    self.useful_tags = useful_tags
    self.delimiters = delimiters
    self.stopwords = stopwords
    self.min_token_length = min_token_length
    self.splitter = self.splitterer()
    self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)
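# A standalone sketch of what the pos lambda above produces and how
# useful_tags / stopwords would typically filter it. It only assumes konlpy
# is installed; the example tag list and stopword set are illustrative, not
# taken from the project.
from konlpy import tag as taggers

def example_tokenize(text,
                     useful_tags=('Noun', 'Verb', 'Adjective'),
                     stopwords=()):
    twitter = taggers.Twitter()
    # Same options the tokenizer above passes by default (norm=True, stem=True).
    pos = twitter.pos(text, norm=True, stem=True)
    return [word for word, word_tag in pos
            if word_tag in useful_tags and word not in stopwords]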
def S2ECorpus(raw_corpus_path, entity_dict, save_path, idx2word):
    # 1. Load all documents
    doc = load_csv_euc_kr(raw_corpus_path)  # [ int -> List[String] ]

    e2s_dict = dict()

    def push(entity_id, sent):
        if entity_id not in e2s_dict:
            e2s_dict[entity_id] = list()
        e2s_dict[entity_id].append(sent)

    all_sentence = []

    # 2. Generate the entity -> sentence dictionary,
    #    removing the entity mention from each sentence.
    print("Iterating docs...")
    bar = ListProgress(doc)
    for article in doc:
        bar.step()
        content = article[IDX_TITLE] + "\n" + article[IDX_CONTENT]
        for sentence in split_sentence(content):
            matching_entities = entity_dict.extract_from(sentence)
            for entity in matching_entities:
                sent_removed = remove_keyword(sentence, entity)
                entity_id = entity_dict.get_group(entity)
                push(entity_id, sent_removed)
                all_sentence.append(sent_removed)
            if len(matching_entities) == 0:
                # Sentences with no matching entity go into group 0.
                push(0, sentence)
                all_sentence.append(sentence)

    def pick_random_sentence():
        return random.choice(all_sentence)

    twitter = tag.Twitter()

    def to_index(text):
        poses = twitter.pos(text, norm=True, stem=True)
        return [idx2word.word2idx(pos[0]) for pos in poses]

    # 3. Generate the corpus of (entity, sentence, label) triples.
    print("Generating corpus...")
    corpus = []
    bar = ListProgress(e2s_dict.keys())
    for key in e2s_dict.keys():
        bar.step()
        if key == 0:
            continue
        related_sentences = e2s_dict[key]
        # Positive examples: sentences that mention this entity.
        for sent in related_sentences:
            corpus.append((key, to_index(sent), 1))
        # Negative examples: random sentences that do not mention this entity.
        for _ in range(len(related_sentences)):
            sent = pick_random_sentence()
            while sent in related_sentences:
                sent = pick_random_sentence()
            corpus.append((key, to_index(sent), 0))

    print("Saving pickle...")
    with open(save_path, "wb") as f:
        pickle.dump(corpus, f)
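# A small sketch of reading back the pickle written by S2ECorpus and checking
# the (entity_id, token_indices, label) triples. The default path is an
# assumed example; the function writes to whatever save_path the caller gives.
import pickle

def load_s2e_corpus(path="data\\S2E.p"):
    with open(path, "rb") as f:
        corpus = pickle.load(f)
    positives = [c for c in corpus if c[2] == 1]
    negatives = [c for c in corpus if c[2] == 0]
    print("triples: {} (pos {}, neg {})".format(len(corpus), len(positives), len(negatives)))
    return corpus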
def gen_corpus():
    idx2word = Idx2Word("data\\idx2word")
    entity_dict = EntityDict("..\\input\\EntityDict.txt")
    target_size = 10000
    raw_corpus = load_csv_euc_kr("..\\input\\bobae_car_euc.csv")[:target_size * 10]
    with open("data\\targetDict.p", "rb") as f:
        target_dictionary = pickle.load(f)

    def get_thread_article_id(post):
        return post[IDX_THREAD_ID], post[IDX_ARTICLE_ID]

    def get_content(article):
        return article[IDX_TITLE] + "\n" + article[IDX_CONTENT]

    sent_dict = dict()
    for sent in raw_corpus:
        sent_dict[get_thread_article_id(sent)] = sent

    def get_prev_sent(article):
        # Return the last sentence of the preceding article in the thread
        # (looked up via target_dictionary), or "" if there is none.
        id = get_thread_article_id(article)
        tid, aid = id
        if id in target_dictionary and tid != aid:
            pre_id = target_dictionary[id]
            pre_article = sent_dict[pre_id]
            return split_sentence(get_content(pre_article))[-1]
        else:
            return ""

    def gen(article):
        # Pair each sentence with the sentence that precedes it; the first
        # sentence is paired with the preceding article's last sentence.
        result = []
        content = get_content(article)
        prev_sent = get_prev_sent(article)
        sentences = split_sentence(content)
        for i, sentence in enumerate(sentences):
            if i == 0:
                if prev_sent:
                    result.append((prev_sent, sentence))
            else:
                result.append((sentences[i - 1], sentence))
        return result

    # Positive pairs: consecutive sentences. Negative pairs: random sentences.
    pos_data = flatten([gen(article) for article in raw_corpus])[:target_size]
    all_sentence = flatten([split_sentence(get_content(article))
                            for article in raw_corpus])

    neg_data = []
    for _ in range(target_size):
        a = random.choice(all_sentence)
        b = random.choice(all_sentence)
        neg_data.append((a, b))

    twitter = tag.Twitter()

    def to_index(text):
        poses = twitter.pos(text, norm=True, stem=True)
        return [idx2word.word2idx(pos[0]) for pos in poses]

    print("Pos : {} Neg : {}".format(len(pos_data), len(neg_data)))

    pos_data_idxed = []
    for a, b in pos_data:
        pos_data_idxed.append((to_index(a), to_index(b), 1))

    neg_data_idxed = []
    for a, b in neg_data:
        neg_data_idxed.append((to_index(a), to_index(b), 0))  # negatives are labeled 0

    with open("S2Q.p", "wb") as f:
        pickle.dump((pos_data_idxed, neg_data_idxed), f)
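# gen_corpus relies on helpers such as flatten and split_sentence that are
# defined elsewhere in the project. A minimal flatten compatible with the
# calls above would look like the sketch below (an assumption about its
# behaviour, not the project's actual implementation).
from itertools import chain

def flatten(list_of_lists):
    # Concatenate a list of lists into a single flat list.
    return list(chain.from_iterable(list_of_lists))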