コード例 #1
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_add_too_big(self):
     """A document larger than the configured chunk_size must raise ExceptionTooBig."""
     root = '/tmp/TEST_CORPUS'
     # chunk_size=12 is deliberately too small for the document plus its header
     Corpus.create(root, chunk_size=12)
     corpus = Corpus(root)
     with self.assertRaises(Corpus.ExceptionTooBig):
         corpus.add(u'12345', 1)
     # release the handle before deleting the on-disk corpus
     del corpus
     shutil.rmtree(root)
コード例 #2
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_add_get_duplicate(self):
     """A second add() reusing an existing id must raise ExceptionDuplicate.

     Fix: the first insert now happens OUTSIDE assertRaises. Previously both
     calls sat inside the context manager, so if the first statement raised
     anything the second (the actual duplicate) was never executed and the
     test could pass for the wrong reason.
     """
     Corpus.create('/tmp/TEST_CORPUS')
     c = Corpus('/tmp/TEST_CORPUS')
     # setup insert — must succeed silently
     c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1,2,3,u'ą'])
     # only the duplicate insert is expected to raise
     with self.assertRaises(Corpus.ExceptionDuplicate):
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1,2,3,u'ą'])
     del c
     shutil.rmtree('/tmp/TEST_CORPUS')
コード例 #3
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_create(self):
     """create() persists the name, starts at chunk 0, and lays out the files."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, name=u"Fancy name")
     corpus = Corpus(root)
     self.assertEqual(corpus.get_property('name'), u'Fancy name')
     self.assertEqual(corpus.get_property('current_chunk'), 0)
     # both the config file and the initial chunk file must exist on disk
     self.assertTrue(os.path.isfile(os.path.join('/tmp/TEST_CORPUS/', Corpus.CONFIG_FILE)))
     self.assertTrue(os.path.isfile(os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '0')))
     del corpus
     shutil.rmtree(root)
コード例 #4
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_getitem(self):
     """Subscript access corpus[i] must be equivalent to corpus.get(i)."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root)
     corpus = Corpus(root)
     corpus.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1,2,3,u'ą'])
     corpus.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1,2,3,u'ą'])
     corpus.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1,2,3,u'ą'])
     # __getitem__ is expected to delegate to get()
     self.assertEqual(corpus.get(2), corpus[2])
     del corpus
     shutil.rmtree(root)
コード例 #5
0
    def __init__(self, corpus_name, path, deterministic=False):
        """Index the train/test files found under *path*, then init the base Corpus.

        corpus_name -- name forwarded to Corpus.__init__
        path -- root directory expected to contain train/*/* and test/*/* files
        deterministic -- stored as a flag only; not used in this constructor
                         (presumably consulted elsewhere — TODO confirm)
        """
        self._path = path
        self._deterministic = deterministic

        # Maps a boolean "is training set" to the matching file paths:
        # True -> train/*/*, False -> test/*/*
        self._filenames = {}
        print "Searching %s/train/*/*" % self._path
        self._filenames[True] = glob("%s/train/*/*" % self._path)
        self._filenames[False] = glob("%s/test/*/*" % self._path)

        Corpus.__init__(self, corpus_name)
コード例 #6
0
ファイル: allPythonContent.py プロジェクト: Mondego/pyreco
  def __init__(self, corpus_name, path, deterministic=False):
    """Index the train/test files found under *path*, then init the base Corpus.

    corpus_name -- name forwarded to Corpus.__init__
    path -- root directory expected to contain train/*/* and test/*/* files
    deterministic -- stored as a flag only; not used in this constructor
                     (presumably consulted elsewhere — TODO confirm)
    """
    self._path = path
    self._deterministic = deterministic

    # Maps a boolean "is training set" to the matching file paths:
    # True -> train/*/*, False -> test/*/*
    self._filenames = {}
    print "Searching %s/train/*/*" % self._path
    self._filenames[True] = glob("%s/train/*/*" % self._path)
    self._filenames[False] = glob("%s/test/*/*" % self._path)

    Corpus.__init__(self, corpus_name)
コード例 #7
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_save_config(self):
     """A property changed via set_property survives save_config() + reload."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, name=u"Fancy name")
     writer = Corpus(root)
     writer.set_property('name', u"Not fancy")
     writer.save_config()
     # a brand-new instance must observe the persisted value
     reader = Corpus(root)
     self.assertEqual(reader.get_property('name'), u"Not fancy")
     del writer, reader
     shutil.rmtree(root)
コード例 #8
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_len(self):
     """len(corpus) reflects the number of documents added."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root)
     corpus = Corpus(root)
     corpus.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     corpus.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     corpus.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     # three inserts -> length three
     self.assertEqual(len(corpus), 3)
     del corpus
     shutil.rmtree(root)
コード例 #9
0
def test_save_load():
    """Round-trip a corpus through save()/load() and compare all components.

    Fixes two resource-handling defects in the original:
    - metadata_fd returned by mkstemp was never closed (fd leak per run)
    - metadata_filename was never removed, leaving a temp file behind
    """
    c, docs = mock_corpus()
    fd, filename = tempfile.mkstemp()
    dict_fd, dict_filename = tempfile.mkstemp()
    metadata_fd, metadata_filename = tempfile.mkstemp()
    # save() receives the metadata *filename*, so the raw fd is unused —
    # close it right away instead of leaking it.
    os.close(metadata_fd)
    try:
        f = None
        dict_f = None
        try:
            f = os.fdopen(fd, 'wb')
            dict_f = os.fdopen(dict_fd, 'wb')
            c.save(documents_file=f, dictionary_file=dict_f,
                   metadata_filename=metadata_filename)
        finally:
            if f is not None:
                f.close()
            if dict_f is not None:
                dict_f.close()

        new_c = Corpus.load(
            documents_file=filename,
            dictionary_file=dict_filename,
            metadata_filename=metadata_filename)
        # documents, metadata and dictionary must all survive the round trip
        assert_equals(c.documents, new_c.documents)
        assert_true(all(c.metadata == new_c.metadata))
        assert_equals(c.dic, new_c.dic)
    finally:
        os.remove(filename)
        os.remove(dict_filename)
        # previously left behind on every run
        os.remove(metadata_filename)
コード例 #10
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
    def test_make_new_chunk(self):
        """make_new_chunk() bumps current_chunk and creates the chunk file."""
        root = '/tmp/TEST_CORPUS'
        Corpus.create(root, name=u"Fancy name")
        corpus = Corpus(root)
        corpus.make_new_chunk()

        # re-open to verify the change was persisted to disk
        reloaded = Corpus(root)
        self.assertEqual(reloaded.get_property('current_chunk'), 1)
        chunk_path = os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '1')
        self.assertTrue(os.path.isfile(chunk_path))

        del corpus, reloaded
        shutil.rmtree(root)
コード例 #11
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
    def test_test_chunk_size(self):
        """test_chunk_size accepts sizes up to chunk_size and rejects larger ones."""
        root = '/tmp/TEST_CORPUS'
        Corpus.create(root, chunk_size=10)
        corpus = Corpus(root)

        # at or below the limit -> truthy; above -> ExceptionTooBig
        self.assertTrue(corpus.test_chunk_size(5))
        self.assertTrue(corpus.test_chunk_size(10))
        with self.assertRaises(Corpus.ExceptionTooBig):
            corpus.test_chunk_size(11)
        del corpus
        shutil.rmtree(root)
コード例 #12
0
def test_save_load_dictionary():
    """The dictionary survives a save_dictionary/load_dictionary round trip."""
    c, docs = mock_corpus()
    dict_fd, dict_filename = tempfile.mkstemp()
    try:
        # the context manager closes the file even if save_dictionary raises,
        # exactly like the original try/finally did
        with os.fdopen(dict_fd, 'wb') as dict_f:
            c.save_dictionary(dict_f)

        new_c = Corpus()
        new_c.load_dictionary(dict_filename)
        assert_equals(c.dic, new_c.dic)

    finally:
        os.remove(dict_filename)
コード例 #13
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
    def test_chunking(self):
        """With chunk_size=13, the second document must spill into chunk 1."""
        root = '/tmp/TEST_CORPUS'
        Corpus.create(root, chunk_size=13)
        corpus = Corpus(root)
        corpus.add(u'12345', 1)
        corpus.add(u'12345', 2)

        # resolve where document id 2 was stored
        chunk_number, offset, head_len, text_len = corpus.get_idx(corpus.get_ridx(2))
        self.assertEqual(chunk_number, 1)

        del corpus
        shutil.rmtree(root)
コード例 #14
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_test_chunk_size(self):
     """Sizes up to the configured chunk_size pass; larger ones raise."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, chunk_size=10)
     corpus = Corpus(root)

     self.assertTrue(corpus.test_chunk_size(5))
     self.assertTrue(corpus.test_chunk_size(10))
     # one byte over the limit must be rejected
     with self.assertRaises(Corpus.ExceptionTooBig):
         corpus.test_chunk_size(11)
     del corpus
     shutil.rmtree(root)
コード例 #15
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_add_too_big(self):
     """add() must refuse a document that cannot fit in one chunk."""
     root = '/tmp/TEST_CORPUS'
     # 12 bytes is too small for the document plus its header
     Corpus.create(root, chunk_size=12)
     corpus = Corpus(root)
     with self.assertRaises(Corpus.ExceptionTooBig):
         corpus.add(u'12345', 1)
     del corpus
     shutil.rmtree(root)
コード例 #16
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_chunking(self):
     """With chunk_size=13, the second document lands in chunk 1."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, chunk_size=13)
     corpus = Corpus(root)
     corpus.add(u'12345', 1)
     corpus.add(u'12345', 2)

     # look up where document id 2 was stored
     chunk_number, offset, head_len, text_len = corpus.get_idx(corpus.get_ridx(2))
     self.assertEqual(chunk_number, 1)

     del corpus
     shutil.rmtree(root)
コード例 #17
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_make_new_chunk(self):
     """make_new_chunk() advances current_chunk and writes the new chunk file."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, name=u"Fancy name")
     corpus = Corpus(root)
     corpus.make_new_chunk()

     # a fresh handle must see the persisted chunk counter
     reloaded = Corpus(root)
     self.assertEqual(reloaded.get_property('current_chunk'), 1)
     self.assertTrue(os.path.isfile(os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '1')))

     del corpus, reloaded
     shutil.rmtree(root)
コード例 #18
0
ファイル: 5_word_fire.py プロジェクト: SapienzaNLP/mulan
    def coordinates_input(t):
        """argparse type-callable for 'path,source_corpus,target_corpus' triples.

        Splits *t* on commas and converts the two corpus names through
        Corpus.argparse, turning every expected failure into
        argparse.ArgumentTypeError so argparse reports it cleanly.

        Fix: the original bare ``except:`` clauses also swallowed
        SystemExit/KeyboardInterrupt; they are narrowed here.
        """
        try:
            # exactly three comma-separated fields are required
            path, source_corpus, target_corpus = t.split(',')
        except ValueError:
            raise argparse.ArgumentTypeError(
                'Coordinates input must be coordinates_path,source_corpus,target_corpus'
            )

        try:
            source_corpus = Corpus.argparse(source_corpus)
        except Exception:
            raise argparse.ArgumentTypeError(
                f'{source_corpus} is not a valid value for enum Corpus')

        try:
            target_corpus = Corpus.argparse(target_corpus)
        except Exception:
            raise argparse.ArgumentTypeError(
                f'{target_corpus} is not a valid value for enum Corpus')

        return path, source_corpus, target_corpus
コード例 #19
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_add_get_duplicate(self):
     """A second add() reusing an existing id must raise ExceptionDuplicate.

     Fix: the setup insert is moved OUTSIDE assertRaises. Previously both
     calls were inside the context manager, so the second call (the real
     duplicate) never executed if the first statement raised anything.
     """
     Corpus.create('/tmp/TEST_CORPUS')
     c = Corpus('/tmp/TEST_CORPUS')
     # setup insert — must succeed silently
     c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     # only the duplicate insert is expected to raise
     with self.assertRaises(Corpus.ExceptionDuplicate):
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     del c
     shutil.rmtree('/tmp/TEST_CORPUS')
コード例 #20
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_create(self):
     """create() persists name/current_chunk and lays out config + chunk 0."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, name=u"Fancy name")
     corpus = Corpus(root)
     self.assertEqual(corpus.get_property('name'), u'Fancy name')
     self.assertEqual(corpus.get_property('current_chunk'), 0)
     # the config file and the initial chunk must both exist on disk
     config_path = os.path.join('/tmp/TEST_CORPUS/', Corpus.CONFIG_FILE)
     chunk_path = os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '0')
     self.assertTrue(os.path.isfile(config_path))
     self.assertTrue(os.path.isfile(chunk_path))
     del corpus
     shutil.rmtree(root)
コード例 #21
0
    def generateVisualization(self, guid):
        """Build the explainer visualization artefacts for document *guid*.

        Loads the SciDoc, computes its token counts and those of every matched
        in-collection reference, records which references share which tokens,
        then writes two files to output_dir:
          <guid>_data.json -- JS variable with the per-reference overlap data
          <guid>_vis.html  -- the formatted, padded HTML document
        """
##        output_dir=Corpus.dir_output
        # hard-coded Windows output path — the Corpus.dir_output line above was
        # deliberately disabled; restore it for portable output
        output_dir=r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\"

        doc=Corpus.loadSciDoc(guid)
        Corpus.tagAllReferencesAsInCollectionOrNot(doc)
        counts1=self.getDocumentTokens(doc)

        # generate a unique id for each unique term, make a dictionary
        for index, token in enumerate(counts1):
            self.term_info[token]={"token_id":str(index), "references": []}

        self.overlapping_tokens={}

        # NOTE(review): computed but never used below — dead code? confirm
        in_collection_references=Corpus.getMetadataByGUID(guid)["outlinks"]
        for ref in doc["references"]:
            match=Corpus.matchReferenceInIndex(ref)
            if match:
                doc2=Corpus.loadSciDoc(match["guid"])
                counts2=self.getDocumentTokens(doc2)
                # for each in_collection_reference number (0 onwards) we store the list
                # of its overlapping tokens with the current document

                self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2)

                # also record, per token, which reference ids share it
                for token in self.overlapping_tokens[ref["id"]]:
                    ref_list=self.term_info[token]["references"]
                    if ref["id"] not in ref_list:
                        ref_list.append(ref["id"])

        # try to find some signal in the noise
        self.filterTokens()

        # emit the overlap data as a JS variable for the HTML page to load
        json_str="var token_data=" + json.dumps(self.overlapping_tokens) + ";"
        writeFileText(json_str, output_dir+guid+"_data.json")

        html=doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
##            extra_attribute_function=self.extraAttributes,
            citation_formatting_function=self.citationFormatting,
            reference_formatting_function=self.referenceFormatting,
            text_formatting_function=self.textFormatting
            )
        html=self.padWithHTML(html, guid)
        writeFileText(html,output_dir+guid+"_vis.html")
コード例 #22
0
    def generateVisualization(self, guid):
        """Build the explainer visualization artefacts for document *guid*.

        Loads the SciDoc, computes its token counts and those of every matched
        in-collection reference, records which references share which tokens,
        then writes two files to output_dir:
          <guid>_data.json -- JS variable with the per-reference overlap data
          <guid>_vis.html  -- the formatted, padded HTML document
        """
##        output_dir=Corpus.dir_output
        # hard-coded Windows output path — the Corpus.dir_output line above was
        # deliberately disabled; restore it for portable output
        output_dir=r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\"

        doc=Corpus.loadSciDoc(guid)
        Corpus.tagAllReferencesAsInCollectionOrNot(doc)
        counts1=self.getDocumentTokens(doc)

        # generate a unique id for each unique term, make a dictionary
        for index, token in enumerate(counts1):
            self.term_info[token]={"token_id":str(index), "references": []}

        self.overlapping_tokens={}

        # NOTE(review): computed but never used below — dead code? confirm
        in_collection_references=Corpus.getMetadataByGUID(guid)["outlinks"]
        for ref in doc["references"]:
            match=Corpus.matchReferenceInIndex(ref)
            if match:
                doc2=Corpus.loadSciDoc(match["guid"])
                counts2=self.getDocumentTokens(doc2)
                # for each in_collection_reference number (0 onwards) we store the list
                # of its overlapping tokens with the current document

                self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2)

                # also record, per token, which reference ids share it
                for token in self.overlapping_tokens[ref["id"]]:
                    ref_list=self.term_info[token]["references"]
                    if ref["id"] not in ref_list:
                        ref_list.append(ref["id"])

        # try to find some signal in the noise
        self.filterTokens()

        # emit the overlap data as a JS variable for the HTML page to load
        json_str="var token_data=" + json.dumps(self.overlapping_tokens) + ";"
        writeFileText(json_str, output_dir+guid+"_data.json")

        html=doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
##            extra_attribute_function=self.extraAttributes,
            citation_formatting_function=self.citationFormatting,
            reference_formatting_function=self.referenceFormatting,
            text_formatting_function=self.textFormatting
            )
        html=self.padWithHTML(html, guid)
        writeFileText(html,output_dir+guid+"_vis.html")
コード例 #23
0
def main():
    """Visualise the paper with the most in-collection references."""
    # guids ordered by in-collection reference count, highest first
    papers = Corpus.listPapers("num_in_collection_references > 10 order by num_in_collection_references desc")
    vis = VisGenerator()
    vis.generateVisualization(papers[0])
コード例 #24
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_add_get(self):
     """Documents round-trip through add()/save_indexes()/get()."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root)
     writer = Corpus(root)
     writer.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.save_indexes()
     reader = Corpus(root)
     # get() returns a (metadata_dict, text) pair; retrieval order is arbitrary
     def expected(doc_id, text):
         return ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'], 'id': doc_id}, text)
     self.assertEqual(reader.get(3), expected(3, u'Żółte źrebie'))
     self.assertEqual(reader.get(1), expected(1, u'Gżegżółką jaźń'))
     self.assertEqual(reader.get(2), expected(2, u'Chrząszcz brzmi w czcinie'))
     del writer, reader
     shutil.rmtree(root)
コード例 #25
0
ファイル: allPythonContent.py プロジェクト: Mondego/pyreco
 def __init__(self, name = "wiki"):
   """Initialise the base Corpus under *name* (defaults to "wiki")."""
   Corpus.__init__(self, name)
コード例 #26
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_iter(self):
     """Iteration yields documents in insertion order, not id order."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root)
     writer = Corpus(root)
     writer.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.save_indexes()
     reader = Corpus(root)
     # each item is (metadata, text); collect ids in iteration order
     seen_ids = [item[0]['id'] for item in reader]
     self.assertEqual(seen_ids, [3, 1, 2])
     del writer, reader
     shutil.rmtree(root)
コード例 #27
0
 def __init__(self, name="wiki"):
     """Initialise the base Corpus under *name* (defaults to "wiki")."""
     Corpus.__init__(self, name)
コード例 #28
0
"""
Smoke-test script: build a corpus from the first 10 rows of a training CSV.

Creates a fresh corpus on disk, adds column index 6 of each CSV row as a
document (ids 1..10), then prints the corpus length and a fetched document.
Python 2 code (print statements, str.decode).
"""

from corpora import Corpus
from nltk.corpus import PlaintextCorpusReader
import csv

# hard-coded local paths — adjust for your machine
corpus_path = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/test'
Corpus.create(corpus_path)
corpus = Corpus(corpus_path)

training_file_path = "/home/mayank/IdeaProjects/Lab_Machine_Learning/src/resources/TrainingData.csv"
reader = csv.reader(open(training_file_path, 'r'))

# rows are numbered from 1; only the first 10 are ingested
for (i, row) in enumerate(reader, 1):
    print i
    corpus.add(row[6].decode('utf-8'), i)
    if i == 10: break

print len(corpus)
print corpus.get()
コード例 #29
0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from corpora import Corpus


if __name__ == "__main__":
    # CLI: scala_file dictionary corpus (positional order matters)
    arg_parser = argparse.ArgumentParser(
        description="load files of scala file and convert it to a python corpus using a dictionary"
    )
    arg_parser.add_argument("scala_file", help="python pickle file, containing tokens and metadata")
    arg_parser.add_argument("dictionary")
    arg_parser.add_argument("corpus")
    cli = arg_parser.parse_args()

    print("loading scala_file")
    loaded = Corpus.load(scala_file=cli.scala_file, dictionary_file=cli.dictionary)

    print("writing corpus to file")
    loaded.save(documents_file=cli.corpus)
コード例 #30
0
def main():
    """Generate the visualization for the most-referenced in-collection paper."""
    generator = VisGenerator()
    # papers sorted by descending in-collection reference count
    docs = Corpus.listPapers("num_in_collection_references > 10 order by num_in_collection_references desc")
    generator.generateVisualization(docs[0])
コード例 #31
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_get_chunk(self):
     """A freshly created corpus exposes a current chunk via get_chunk()."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, chunk_size=10)
     corpus = Corpus(root)
     # the initial chunk must already be available
     self.assertIsNotNone(corpus.get_chunk())
     del corpus
     shutil.rmtree(root)
コード例 #32
0
ファイル: main.py プロジェクト: calincrist/doc_similarity
from corpora import Corpus

import logging
from logging.config import fileConfig

# Setup logging.
# fileConfig('log_config.ini')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
hdlr = logging.FileHandler('./similarity.log')
log = logging.getLogger()
log.addHandler(hdlr)

log.info('Start!')

# Build and tokenize the Romanian corpus; the Bessarabia half is disabled.
corpuss = Corpus()
# corpuss.tokenize_bessarabia()
corpuss.tokenize_romania()
big_corpus = corpuss.tokenized_romania  #+ corpuss.tokenized_romania

# log.info('Big corpus: %s' % big_corpus)

# remove common words and tokenize
# Romanian stopword list. NOTE(review): the second string fragment has no
# trailing space, so 'acésta' and 'tóte' concatenate into one bogus entry
# ('acéstatóte') — confirm whether that is intentional.
stoplist = set('această intru preste față făcut foarte fostu nóstre despre sale dara anulu inse alta cele sunt fara prin dupa cari aceasta sînt fără toate între după acii '\
'cãtre decît suntu dein loru dela numai voru catu totu suntu acésta celu inca pndia pana acésta' \
'tóte carea acesta tote candu intre dectu multu pote acestu nici tóte fost póte'.split())
log.info(stoplist)
# Keep words longer than 3 chars that are not stopwords.
# NOTE(review): word.encode('utf-8') yields bytes; membership against the
# str-literal stoplist only matches when both sides are byte strings
# (Python 2) — verify if this ever runs on Python 3.
texts = [[
    word for word in document.lower().split()
    if ((len(word) > 3) and (word.encode('utf-8') not in stoplist))
] for document in big_corpus]
コード例 #33
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_save_config(self):
     """save_config() persists property changes so a reload observes them."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, name=u"Fancy name")
     writer = Corpus(root)
     writer.set_property('name', u"Not fancy")
     writer.save_config()
     # re-open the corpus and check the stored value
     reader = Corpus(root)
     self.assertEqual(reader.get_property('name'), u"Not fancy")
     del writer, reader
     shutil.rmtree(root)
コード例 #34
0
ファイル: test_corpus.py プロジェクト: Wb-Alpha/FinalExam
 def test_get_chunk(self):
     """get_chunk() returns a non-None chunk right after creation."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root, chunk_size=10)
     corpus = Corpus(root)
     self.assertIsNotNone(corpus.get_chunk())
     del corpus
     shutil.rmtree(root)
コード例 #35
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_add_get(self):
     """add()ed documents are readable back after save_indexes()."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root)
     writer = Corpus(root)
     writer.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1,2,3,u'ą'])
     writer.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1,2,3,u'ą'])
     writer.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1,2,3,u'ą'])
     writer.save_indexes()
     reader = Corpus(root)
     # get() yields (metadata, text); fetch order is arbitrary
     def expected(doc_id, text):
         return ({'p1': 1, 'p2': "2", 'p3': [1,2,3,u'ą'], 'id': doc_id}, text)
     self.assertEqual(reader.get(3), expected(3, u'Żółte źrebie'))
     self.assertEqual(reader.get(1), expected(1, u'Gżegżółką jaźń'))
     self.assertEqual(reader.get(2), expected(2, u'Chrząszcz brzmi w czcinie'))
     del writer, reader
     shutil.rmtree(root)
コード例 #36
0
import logging
from logging.config import fileConfig

# Setup logging.
# fileConfig('log_config.ini')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
hdlr = logging.FileHandler('./similarity.log')
log = logging.getLogger()
log.addHandler(hdlr)

log.info('Start!')


# NOTE(review): Corpus is not imported in this snippet — presumably supplied
# by surrounding context; confirm. The Bessarabia half is disabled.
corpuss = Corpus()
# corpuss.tokenize_bessarabia()
corpuss.tokenize_romania()
big_corpus = corpuss.tokenized_romania #+ corpuss.tokenized_romania

# log.info('Big corpus: %s' % big_corpus)

# remove common words and tokenize
# Romanian stopword list. NOTE(review): the second string fragment has no
# trailing space, so 'acésta' and 'tóte' concatenate into one bogus entry
# ('acéstatóte') — confirm whether that is intentional.
stoplist = set('această intru preste față făcut foarte fostu nóstre despre sale dara anulu inse alta cele sunt fara prin dupa cari aceasta sînt fără toate între după acii '\
'cãtre decît suntu dein loru dela numai voru catu totu suntu acésta celu inca pndia pana acésta' \
'tóte carea acesta tote candu intre dectu multu pote acestu nici tóte fost póte'.split())
log.info(stoplist)
# Keep words longer than 3 chars that are not stopwords.
# NOTE(review): word.encode('utf-8') yields bytes; comparison against the
# str-literal stoplist behaves differently on Python 2 vs 3 — verify.
texts = [[word for word in document.lower().split() if ((len(word) > 3) and (word.encode('utf-8') not in stoplist))]
         for document in big_corpus]

# remove words that appear only once
コード例 #37
0
ファイル: test_corpus.py プロジェクト: cypreess/corpora
 def test_iter(self):
     """Iterating the corpus follows insertion order, not id order."""
     root = '/tmp/TEST_CORPUS'
     Corpus.create(root)
     writer = Corpus(root)
     writer.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1,2,3,u'ą'])
     writer.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1,2,3,u'ą'])
     writer.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1,2,3,u'ą'])
     writer.save_indexes()
     reader = Corpus(root)
     # collect document ids in iteration order
     seen = [entry[0]['id'] for entry in reader]
     self.assertEqual(seen, [3, 1, 2])
     del writer, reader
     shutil.rmtree(root)