Example #1
0
def load_file(filename):
    """Load a pickled story dictionary and build an NER-token corpus.

    Reads ``ip_dir/filename`` (a pickled dict mapping story file names to
    metadata), concatenates the PER/LOC/ORG/ONS named entities found under
    ``['NER']['TITLE_CONTENT']`` for each story, and collects document and
    event identifiers.

    Returns:
        (corpus, doc_ids, event_list, total_no_word): ``corpus`` is a list
        of entity-token lists, ``doc_ids`` the matching document ids,
        ``event_list`` the unique event ids in order of first appearance,
        and ``total_no_word`` the total number of tokens kept.
    """
    corpus = []
    doc_ids = []
    event_list = []
    fname_total = '%s/%s' % (ip_dir, filename)
    # Context manager guarantees the handle is closed exactly once (the
    # original called f.close() twice, the second time on a closed file).
    with open(fname_total, 'rb') as f:
        story_dic = pickle.load(f)
    total_no_word = 0
    for story in sorted(story_dic):
        temp_doc = []
        for item in ['PER', 'LOC', 'ORG', 'ONS']:
            temp_doc += story_dic[story]['NER']['TITLE_CONTENT'][item]
        # str.strip removes a *character set*, not a suffix, so the
        # original could eat trailing h/t/m/l characters of the id.
        # Drop only an actual ".html"/".htm" extension.
        doc_id = re.sub(r'\.html?$', '', story)
        event_id = '_'.join(doc_id.split('_')[:2])
        if temp_doc:
            corpus.append(temp_doc)
            doc_ids.append(doc_id)
            total_no_word += len(temp_doc)
            if event_id not in event_list:
                event_list.append(event_id)
    return corpus, doc_ids, event_list, total_no_word
Example #2
0
def load_file(filename):
    """Read a UTF-8 file where each line is one comma-separated document.

    Returns:
        list of documents, each a list of the line's comma-separated fields.
    """
    with codecs.open(filename, encoding='utf8') as freader:
        return [line.strip().split(',') for line in freader]
def load_sentences(sentences):
    """Tokenize each sentence into word tokens (in-word apostrophes kept).

    Sentences that yield no tokens are dropped.

    Returns:
        list of token lists, one per non-empty sentence.
    """
    token_pattern = re.compile(r'\w+(?:\'\w+)?')
    tokenized = (token_pattern.findall(sentence) for sentence in sentences)
    return [tokens for tokens in tokenized if tokens]
Example #4
0
def load_dataframe_jp(documents):
    """Tokenize Japanese documents with MeCab in wakati (word-split) mode.

    Args:
        documents: iterable of Japanese text strings.

    Returns:
        list of token lists, one per input document.
    """
    corpus = []
    tagger = MeCab.Tagger('-O wakati')
    # NOTE(review): parse("") before real input — presumably the known
    # mecab-python workaround so the first real parse is not corrupted;
    # confirm it is still needed with the installed binding version.
    tagger.parse("")
    for doc in documents:
        # Wakati output is space-separated surface forms; split() yields
        # the token list.
        tokens = tagger.parse(doc.strip()).split()
        corpus.append(tokens)
    return corpus
Example #5
0
def load_dataframe(documents):
    """Tokenize each document into word tokens (in-word apostrophes kept).

    Documents that produce no tokens are skipped.

    Returns:
        list of token lists.
    """
    word_re = re.compile(r'\w+(?:\'\w+)?')
    token_lists = (word_re.findall(text) for text in documents)
    return [words for words in token_lists if words]
def load_file(filename):
    """Tokenize a text file, one document per line.

    Lines that produce no tokens are skipped.

    Args:
        filename: path to a plain-text file.

    Returns:
        list of token lists.
    """
    corpus = []
    word_re = re.compile(r'\w+(?:\'\w+)?')
    # 'with' closes the file even if reading raises; the original leaked
    # the handle on error.
    with open(filename, 'r') as f:
        for line in f:
            doc = word_re.findall(line)
            if doc:
                corpus.append(doc)
    return corpus
def load_file(filename):
    """Tokenize a text file, one document per line, skipping empty lines.

    Args:
        filename: path to a plain-text file.

    Returns:
        list of token lists (in-word apostrophes kept inside tokens).
    """
    # Context manager replaces the manual open/close pair so the handle
    # is released even when iteration raises.
    with open(filename, 'r') as f:
        docs = (re.findall(r'\w+(?:\'\w+)?', line) for line in f)
        return [doc for doc in docs if doc]
Example #8
0
def load_file(filename):
    """Tokenize one file where each line corresponds to one document.

    Args:
        filename: path to a plain-text file.

    Returns:
        list of token lists; lines producing no tokens are skipped.
    """
    corpus = []
    word_re = re.compile(r'\w+(?:\'\w+)?')
    # 'with' ensures the file is closed even if tokenizing a line raises.
    with open(filename, 'r') as f:
        for line in f:
            doc = word_re.findall(line)
            if doc:
                corpus.append(doc)
    return corpus
Example #9
0
def load_file_reuter(filename):
    """Load a pickled Reuters story dictionary into a corpus.

    Reads ``ip_dir/filename`` (a pickled dict of story id -> metadata).
    Every story is kept; its 'content' becomes the document and its
    'topic' the event id.

    Returns:
        (corpus, doc_ids, event_list, total_no_word).
        NOTE(review): total_no_word is never accumulated in the original
        and is always 0; kept as-is for caller compatibility — confirm
        whether it should sum document lengths like the sibling loader.
    """
    corpus = []
    doc_ids = []
    event_list = []
    fname_total = '%s/%s' % (ip_dir, filename)
    # Context manager closes the handle once (original closed it twice,
    # the second time on an already-closed file).
    with open(fname_total, 'rb') as f:
        story_dic = pickle.load(f)
    total_no_word = 0
    for story in sorted(story_dic):
        event_id = story_dic[story]['topic']
        corpus.append(story_dic[story]['content'])
        doc_ids.append(story)
        if event_id not in event_list:
            event_list.append(event_id)
    return corpus, doc_ids, event_list, total_no_word
Example #10
0
def load_file(filename):
    """Load a pickled story dict and tokenize each story's TITLE+CONTENT.

    Args:
        filename: path to a pickle file mapping story file names (e.g.
            'event_01_doc.html') to dicts with 'CONTENT' and 'TITLE'.

    Returns:
        (corpus, doc_ids, event_list): token lists for stories that
        produced at least one token, their document ids (extension
        stripped), and unique event ids in order of first appearance.
    """
    corpus = []
    doc_ids = []
    event_list = []
    # Pickle data is binary: open in 'rb' — text mode 'r' (as in the
    # original) breaks pickle.load under Python 3. 'with' also removes
    # the redundant second f.close().
    with open(filename, 'rb') as f:
        story_dic = pickle.load(f)
    for story in sorted(story_dic):
        title_content = story_dic[story]['CONTENT'] + story_dic[story]['TITLE']
        doc = re.findall(r'\w+(?:\'\w+)?', title_content)  # tokenizing here
        # str.strip removes a *character set*, not a suffix; drop only a
        # real trailing ".html"/".htm" extension.
        doc_id = re.sub(r'\.html?$', '', story)
        event_id = '_'.join(doc_id.split('_')[:2])
        if doc:
            corpus.append(doc)
            doc_ids.append(doc_id)
            if event_id not in event_list:
                event_list.append(event_id)
    return corpus, doc_ids, event_list
Example #11
0
def load_file(filename, format=False):
    """Tokenize a file, one document per line.

    Args:
        filename: path to the input file.
        format: when False, read with the default codec and skip lines
            producing no tokens; when True, read as UTF-8 and replace a
            token-less line with the previous document so that line
            numbering is preserved.

    Returns:
        list of token lists.
    """
    word_re = re.compile(r'\w+(?:\'\w+)?')
    if not format:  # idiomatic truth test instead of "format == False"
        corpus = []
        with open(filename, 'r') as f:
            for line in f:
                doc = word_re.findall(line)
                if doc:
                    corpus.append(doc)
        return corpus
    else:
        corpus = []
        pre_doc = ""
        with codecs.open(filename, 'r', 'utf-8') as f:
            # Stream line-by-line; no need to materialize readlines().
            for line in f:
                # codecs.open already yields decoded str; re-encoding to
                # bytes (as the original did) raises TypeError against a
                # str pattern under Python 3.
                doc = word_re.findall(line)
                if doc:
                    corpus.append(doc)
                    pre_doc = doc
                else:
                    corpus.append(pre_doc)
        return corpus