Example 1
from konlpy import tag   # KoNLPy part-of-speech taggers (Twitter/Okt)

def parse_file(pid, input_queue, output_queue):
    twitter = tag.Twitter()
    def tagger(sent):
        return [word for word, tag in twitter.pos(sent) if tag not in ['Punctuation', 'Unknown']]

    while True:
        get = input_queue.get()
        if get is None:  # shutdown sentinel: report our pid and exit
            output_queue.put(pid)
            break
        
        with open(get) as f:
            lines = f.readlines()
        
        sent = []
        for line in lines:
            if line == '\n' or line.find('<doc') != -1:  # skip blank lines and <doc ...> opening tags
                continue

            if line.find('doc>') != -1:  # </doc> closes a document: emit its tagged sentences
                output_queue.put(' '.join(sent))
                sent = []
                continue

            line = line.strip()
            try:
                sent.append(' '.join(tagger(line)))
            except Exception as e:
                print('Exception occurred: "{}"'.format(line))
                print(e)
                output_queue.put(pid)
                break
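
A minimal driver sketch (not part of the original source; the worker count and file list are hypothetical) showing one way the queue protocol of parse_file could be wired up with multiprocessing:

from multiprocessing import Process, Queue

def run_parsers(paths, n_workers=4):
    input_queue, output_queue = Queue(), Queue()
    workers = [Process(target=parse_file, args=(pid, input_queue, output_queue))
               for pid in range(n_workers)]
    for w in workers:
        w.start()
    for path in paths:            # one dump file per task
        input_queue.put(path)
    for _ in workers:             # one None sentinel per worker triggers shutdown
        input_queue.put(None)

    sentences, finished = [], 0
    while finished < n_workers:
        item = output_queue.get()
        if isinstance(item, int):   # pids mark worker shutdown (the except branch above also emits one)
            finished += 1
        else:
            sentences.append(item)  # a tagged, space-joined document
    for w in workers:
        w.join()
    return sentences
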
Example 2
    def __init__(self, new=False):
        self.dictionary = ""
        self.model = ""
        self.lock = threading.Lock()
        self.twitter = tag.Twitter()
        self.tokenize_cache = dict()
        if not new:
            self.load_from_file(LDA_PICKLE_PATH)
        self.topic_cache = dict()
Example 3
    def __init__(self, tagger, useful_tags, delimiters, stopwords, **kwargs):
        if tagger == 'twitter':
            self.tagger = taggers.Twitter()
            self.tagger_options = {
                'norm': bool(kwargs.get('norm', True)),
                'stem': bool(kwargs.get('stem', True)),
            }
        else:
            raise kwchatbotError("available tagger is: twitter")
        self.useful_tags = useful_tags
        self.delimiters = delimiters
        self.stopwords = stopwords
        self.splitter = self.splitterer()
        self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)
Example 4
    def __init__(self, tagger, useful_tags, delimiters, min_token_length,
                 stopwords, **kwargs):
        if tagger == 'twitter':
            self.tagger = taggers.Twitter()
            self.tagger_options = {
                'norm': bool(kwargs.get('norm', True)),
                'stem': bool(kwargs.get('stem', True)),
            }
        elif tagger == 'komoran':
            self.tagger = taggers.Komoran()
            self.tagger_options = {
                'flatten': bool(kwargs.get('flatten', True)),
            }
        elif tagger == 'hannanum':
            self.tagger = taggers.Hannanum()
            self.tagger_options = {
                'ntags': int(kwargs.get('ntags', 9)),
                'flatten': bool(kwargs.get('flatten', True)),
            }
        elif tagger == 'kkma':
            self.tagger = taggers.Kkma()
            self.tagger_options = {
                'flatten': bool(kwargs.get('flatten', True)),
            }
        # Mecab is not supported on Windows, so it is left out below.

        # elif tagger == 'mecab':
        #    self.tagger = taggers.Mecab()
        #    self.tagger_options = {
        #        'flatten': bool(kwargs.get('flatten', True)),
        #    }
        else:
            raise LexRankError(
                "available taggers are: twitter, komoran, hannanum, kkma")
        self.useful_tags = useful_tags
        self.delimiters = delimiters
        self.stopwords = stopwords
        self.min_token_length = min_token_length
        self.splitter = self.splitterer()
        self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)
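
A standalone sketch (assuming KoNLPy is installed; the sample sentence is arbitrary) of what the bound self.pos closure does for the 'twitter' branch of the constructor above:

from konlpy import tag as taggers

tagger = taggers.Twitter()
tagger_options = {'norm': True, 'stem': True}
pos = lambda text: tagger.pos(text, **tagger_options)

# Returns a list of (morpheme, tag) pairs, e.g. [('텍스트', 'Noun'), ...]
print(pos("텍스트 요약을 테스트합니다"))
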
def S2ECorpus(raw_corpus_path, entity_dict, save_path, idx2word):
    # 1. Load all documents
    doc = load_csv_euc_kr(raw_corpus_path)

    # [ int->List[String] ]
    e2s_dict = dict()

    def push(entity_id, sent):
        if entity_id not in e2s_dict:
            e2s_dict[entity_id] = list()
        e2s_dict[entity_id].append(sent)

    all_sentence = []
    # 2. Generate Entity - Sentence Dictionary
    #    Remove entity from the sentence
    print("Iterating docs...")
    bar = ListProgress(doc)
    for article in doc:
        bar.step()
        content = article[IDX_TITLE] + "\n" + article[IDX_CONTENT]
        for sentence in split_sentence(content):
            matching_entities = entity_dict.extract_from(sentence)
            for entity in matching_entities:
                sent_removed = remove_keyword(sentence, entity)
                entity_id = entity_dict.get_group(entity)
                push(entity_id, sent_removed)
                all_sentence.append(sent_removed)

            if len(matching_entities) == 0:
                push(0, sentence)
                all_sentence.append(sentence)

    def pick_random_sentence():
        return random.choice(all_sentence)

    twitter = tag.Twitter()

    def to_index(text):
        poses = twitter.pos(text, norm=True, stem=True)
        res = [idx2word.word2idx(pos[0]) for pos in poses]
        return res

    # Corpus Gen
    #
    # 3. Sentence / Entity / Label
    print("Generating corpus...")
    corpus = []
    bar = ListProgress(e2s_dict.keys())
    for key in e2s_dict.keys():
        bar.step()
        if key == 0:
            continue
        related_sentences = e2s_dict[key]
        # Positive Corpus
        for sent in related_sentences:
            corpus.append((key, to_index(sent), 1))
        # print((entity_dict.get_name(key),sent,1))
        # Negative Corpus
        for i in range(len(related_sentences)):
            sent = pick_random_sentence()
            while sent in related_sentences:
                sent = pick_random_sentence()
            corpus.append((key, to_index(sent), 0))
            #print((entity_dict.get_name(key), sent, 0))

    print("Saving pickle...")
    pickle.dump(corpus, open(save_path, "wb"))
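
A small follow-up sketch (the file name is hypothetical; use whatever was passed as save_path) for reading the pickled corpus back and inspecting the (entity_id, token_indices, label) triples written above:

import pickle

with open("s2e_corpus.p", "rb") as f:   # hypothetical save_path
    corpus = pickle.load(f)

entity_id, token_indices, label = corpus[0]
print(entity_id, token_indices[:10], label)   # label 1 = sentence mentioning the entity, 0 = random sentence
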
def gen_corpus():
    idx2word = Idx2Word("data\\idx2word")
    entity_dict = EntityDict("..\\input\\EntityDict.txt")
    target_size = 10000
    raw_corpus = load_csv_euc_kr("..\\input\\bobae_car_euc.csv")[:target_size * 10 ]

    target_dictionary = pickle.load(open("data\\targetDict.p","rb"))



    def get_thread_article_id(post):
        return post[IDX_THREAD_ID], post[IDX_ARTICLE_ID]
    def get_content(article):
        return article[IDX_TITLE] + "\n" + article[IDX_CONTENT]

    sent_dict = dict()
    for sent in raw_corpus:
        sent_dict[get_thread_article_id(sent)] = sent


    def get_prev_sent(article):
        id = get_thread_article_id(article)
        tid, aid = id
        if id in target_dictionary and tid != aid:
            pre_id = target_dictionary[id]
            pre_article = sent_dict[pre_id]
            return split_sentence(get_content(pre_article))[-1]
        else:
            return ""

    def gen(article):
        result = []
        content = get_content(article)
        prev_sent = get_prev_sent(article)

        sentences = split_sentence(content)
        for i, sentence in enumerate(sentences):
            if i == 0:
                if prev_sent:
                    result.append((prev_sent, sentence))
            else:
                result.append((sentences[i-1], sentence))
        return result

    pos_data = flatten([gen(article) for article in raw_corpus])[:target_size]

    all_sentence = flatten([split_sentence(get_content(article)) for article in raw_corpus])

    neg_data = []
    for i in range(target_size):
        a = random.choice(all_sentence)
        b = random.choice(all_sentence)
        neg_data.append((a,b))

    twitter = tag.Twitter()

    def to_index(text):
        poses = twitter.pos(text, norm=True, stem=True)
        res = [idx2word.word2idx(pos[0]) for pos in poses]
        return res

    print("Pos : {} Neg : {}".format(len(pos_data), len(neg_data)))

    pos_data_idxed = []
    for p in pos_data:
        a,b = p
        token = (to_index(a), to_index(b), 1)
        pos_data_idxed.append(token)

    neg_data_idxed = []
    for p in neg_data:
        a,b = p
        token = (to_index(a), to_index(b), 0)  # negative pairs get label 0
        neg_data_idxed.append(token)


    pickle.dump((pos_data_idxed, neg_data_idxed), open("S2Q.p", "wb"))
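
A quick sanity check (assuming gen_corpus has been run in the working directory) that the two halves of the pickle carry the expected labels:

import pickle

with open("S2Q.p", "rb") as f:
    pos_data_idxed, neg_data_idxed = pickle.load(f)

print(len(pos_data_idxed), len(neg_data_idxed))    # up to target_size each
print(pos_data_idxed[0][2], neg_data_idxed[0][2])  # expected labels: 1 and 0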