def test_add_too_big(self):
    """A text longer than the configured chunk_size must be rejected."""
    Corpus.create('/tmp/TEST_CORPUS', chunk_size=12)
    # BUG FIX: clean up even when an assertion fails; the original rmtree
    # was unreachable on failure, and the leftover directory then breaks
    # every subsequent Corpus.create() in the suite.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    with self.assertRaises(Corpus.ExceptionTooBig):
        c.add(u'12345', 1)
def test_add_get_duplicate(self):
    """Adding the same document id twice must raise ExceptionDuplicate."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when the test fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    # BUG FIX: the first add() belongs OUTSIDE assertRaises -- if it raised
    # unexpectedly the original test would still (wrongly) pass.
    c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    with self.assertRaises(Corpus.ExceptionDuplicate):
        c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
def test_create(self):
    """create() must persist properties and lay out the config and chunk 0."""
    Corpus.create('/tmp/TEST_CORPUS', name=u"Fancy name")
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    self.assertEqual(c.get_property('name'), u'Fancy name')
    self.assertEqual(c.get_property('current_chunk'), 0)
    self.assertTrue(os.path.isfile(
        os.path.join('/tmp/TEST_CORPUS/', Corpus.CONFIG_FILE)))
    self.assertTrue(os.path.isfile(
        os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '0')))
def test_getitem(self):
    """__getitem__ must behave exactly like get()."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    self.assertEqual(c.get(2), c[2])
def __init__(self, corpus_name, path, deterministic=False):
    """Index the train/ and test/ file trees found under *path*.

    :param corpus_name: name forwarded to the Corpus base constructor.
    :param path: root directory expected to contain train/*/* and test/*/*.
    :param deterministic: stored on the instance; presumably selects a
        repeatable file order -- TODO confirm where this flag is consumed.
    """
    self._path = path
    self._deterministic = deterministic
    # BUG FIX: `print "..."` is Python-2-only syntax; the parenthesized
    # single-argument form behaves identically on Python 2 and 3.
    print("Searching %s/train/*/*" % self._path)
    # Keyed by the boolean "is training set?" flag.
    self._filenames = {}
    self._filenames[True] = glob("%s/train/*/*" % self._path)
    self._filenames[False] = glob("%s/test/*/*" % self._path)
    Corpus.__init__(self, corpus_name)
def test_save_config(self):
    """set_property() + save_config() must persist across a re-open."""
    Corpus.create('/tmp/TEST_CORPUS', name=u"Fancy name")
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.set_property('name', u"Not fancy")
    c.save_config()
    d = Corpus('/tmp/TEST_CORPUS')
    self.assertEqual(d.get_property('name'), u"Not fancy")
def test_len(self):
    """len() must report the number of documents added."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    self.assertEqual(len(c), 3)
def test_save_load():
    """Round-trip: save() then load() must reproduce documents, metadata, dic."""
    c, docs = mock_corpus()
    fd, filename = tempfile.mkstemp()
    dict_fd, dict_filename = tempfile.mkstemp()
    metadata_fd, metadata_filename = tempfile.mkstemp()
    # BUG FIX: metadata_fd was never closed and metadata_filename never
    # removed, leaking one fd and one temp file per test run.  save() is
    # given only the *filename*, so the raw fd can be closed right away.
    os.close(metadata_fd)
    try:
        f = None
        dict_f = None
        try:
            f = os.fdopen(fd, 'wb')
            dict_f = os.fdopen(dict_fd, 'wb')
            c.save(documents_file=f, dictionary_file=dict_f,
                   metadata_filename=metadata_filename)
        finally:
            if f is not None:
                f.close()
            if dict_f is not None:
                dict_f.close()
        new_c = Corpus.load(
            documents_file=filename,
            dictionary_file=dict_filename,
            metadata_filename=metadata_filename)
        assert_equals(c.documents, new_c.documents)
        assert_true(all(c.metadata == new_c.metadata))
        assert_equals(c.dic, new_c.dic)
    finally:
        os.remove(filename)
        os.remove(dict_filename)
        # BUG FIX: also remove the metadata temp file.
        os.remove(metadata_filename)
def test_make_new_chunk(self):
    """make_new_chunk() must bump current_chunk and create the chunk file."""
    Corpus.create('/tmp/TEST_CORPUS', name=u"Fancy name")
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.make_new_chunk()
    d = Corpus('/tmp/TEST_CORPUS')
    self.assertEqual(d.get_property('current_chunk'), 1)
    self.assertTrue(os.path.isfile(
        os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '1')))
def test_test_chunk_size(self):
    """Sizes up to chunk_size pass; anything larger raises ExceptionTooBig."""
    Corpus.create('/tmp/TEST_CORPUS', chunk_size=10)
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    self.assertTrue(c.test_chunk_size(5))
    self.assertTrue(c.test_chunk_size(10))  # boundary: exactly chunk_size
    with self.assertRaises(Corpus.ExceptionTooBig):
        c.test_chunk_size(11)
def test_save_load_dictionary():
    """Saving the dictionary and loading it back must round-trip `dic`."""
    c, docs = mock_corpus()
    dict_fd, dict_filename = tempfile.mkstemp()
    try:
        # `with` closes the stream on success and on error alike,
        # mirroring the explicit try/finally of the original.
        with os.fdopen(dict_fd, 'wb') as dict_f:
            c.save_dictionary(dict_f)
        new_c = Corpus()
        new_c.load_dictionary(dict_filename)
        assert_equals(c.dic, new_c.dic)
    finally:
        os.remove(dict_filename)
def test_chunking(self):
    """A document that does not fit the current chunk must land in chunk 1."""
    Corpus.create('/tmp/TEST_CORPUS', chunk_size=13)
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'12345', 1)
    c.add(u'12345', 2)
    (chunk_number, offset, head_len, text_len) = c.get_idx(c.get_ridx(2))
    self.assertEqual(chunk_number, 1)
def test_make_new_chunk(self):
    """make_new_chunk() must advance current_chunk and write the new file."""
    Corpus.create('/tmp/TEST_CORPUS', name=u"Fancy name")
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.make_new_chunk()
    d = Corpus('/tmp/TEST_CORPUS')
    self.assertEqual(d.get_property('current_chunk'), 1)
    self.assertTrue(os.path.isfile(
        os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '1')))
def coordinates_input(t):
    """argparse `type=` helper: parse 'path,source_corpus,target_corpus'.

    Returns a (path, source_corpus, target_corpus) tuple where the two
    corpus fields are resolved through Corpus.argparse().
    Raises argparse.ArgumentTypeError on any malformed input.
    """
    try:
        path, source_corpus, target_corpus = t.split(',')
    # BUG FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
    # a wrong field count raises exactly ValueError here.
    except ValueError:
        raise argparse.ArgumentTypeError(
            'Coordinates input must be coordinates_path,source_corpus,target_corpus'
        )
    try:
        source_corpus = Corpus.argparse(source_corpus)
    except Exception:
        raise argparse.ArgumentTypeError(
            f'{source_corpus} is not a valid value for enum Corpus')
    try:
        target_corpus = Corpus.argparse(target_corpus)
    except Exception:
        raise argparse.ArgumentTypeError(
            f'{target_corpus} is not a valid value for enum Corpus')
    return path, source_corpus, target_corpus
def test_add_get_duplicate(self):
    """Re-adding an existing document id must raise ExceptionDuplicate."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when the test fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    # BUG FIX: the first add() belongs OUTSIDE assertRaises -- if it raised
    # unexpectedly the original test would still (wrongly) pass.
    c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    with self.assertRaises(Corpus.ExceptionDuplicate):
        c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
def test_create(self):
    """A fresh corpus carries its properties and the config/chunk-0 files."""
    Corpus.create('/tmp/TEST_CORPUS', name=u"Fancy name")
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    self.assertEqual(c.get_property('name'), u'Fancy name')
    self.assertEqual(c.get_property('current_chunk'), 0)
    self.assertTrue(os.path.isfile(
        os.path.join('/tmp/TEST_CORPUS/', Corpus.CONFIG_FILE)))
    self.assertTrue(os.path.isfile(
        os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '0')))
def generateVisualization(self, guid): """ Given a guid, it prepares its explainer document """ ## output_dir=Corpus.dir_output output_dir=r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\" doc=Corpus.loadSciDoc(guid) Corpus.tagAllReferencesAsInCollectionOrNot(doc) counts1=self.getDocumentTokens(doc) # generate a unique id for each unique term, make a dictionary for index, token in enumerate(counts1): self.term_info[token]={"token_id":str(index), "references": []} self.overlapping_tokens={} in_collection_references=Corpus.getMetadataByGUID(guid)["outlinks"] for ref in doc["references"]: match=Corpus.matchReferenceInIndex(ref) if match: doc2=Corpus.loadSciDoc(match["guid"]) counts2=self.getDocumentTokens(doc2) # for each in_collection_reference number (0 onwards) we store the list # of its overlapping tokens with the current document self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2) for token in self.overlapping_tokens[ref["id"]]: ref_list=self.term_info[token]["references"] if ref["id"] not in ref_list: ref_list.append(ref["id"]) # try to find some signal in the noise self.filterTokens() json_str="var token_data=" + json.dumps(self.overlapping_tokens) + ";" writeFileText(json_str, output_dir+guid+"_data.json") html=doc.prettyPrintDocumentHTML( True, True, False, ## extra_attribute_function=self.extraAttributes, citation_formatting_function=self.citationFormatting, reference_formatting_function=self.referenceFormatting, text_formatting_function=self.textFormatting ) html=self.padWithHTML(html, guid) writeFileText(html,output_dir+guid+"_vis.html")
def main():
    """Render the explainer visualization for the best-connected paper."""
    ## explainZoning("P95-1026")
    papers = Corpus.listPapers(
        "num_in_collection_references > 10 order by num_in_collection_references desc")
    # Only the top paper (most in-collection references) is visualized.
    VisGenerator().generateVisualization(papers[0])
def test_add_get(self):
    """Documents persisted via save_indexes() must be retrievable by id."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.save_indexes()
    d = Corpus('/tmp/TEST_CORPUS')
    # Read back out of insertion order to prove random access works.
    self.assertEqual(d.get(3), ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'],
                                 'id': 3}, u'Żółte źrebie'))
    self.assertEqual(d.get(1), ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'],
                                 'id': 1}, u'Gżegżółką jaźń'))
    self.assertEqual(d.get(2), ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'],
                                 'id': 2}, u'Chrząszcz brzmi w czcinie'))
def __init__(self, name="wiki"):
    """Initialise the wiki-backed corpus under *name* (default "wiki")."""
    Corpus.__init__(self, name)
def test_iter(self):
    """Iteration must yield documents in insertion (not id) order."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.save_indexes()
    d = Corpus('/tmp/TEST_CORPUS')
    ids = [t[0]['id'] for t in d]
    self.assertEqual(ids, [3, 1, 2])
def __init__(self, name="wiki"):
    """Construct the corpus, delegating all setup to the Corpus base class."""
    Corpus.__init__(self, name)
""" """ from corpora import Corpus from nltk.corpus import PlaintextCorpusReader import csv corpus_path = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/test' Corpus.create(corpus_path) corpus = Corpus(corpus_path) training_file_path = "/home/mayank/IdeaProjects/Lab_Machine_Learning/src/resources/TrainingData.csv" reader = csv.reader(open(training_file_path, 'r')) for (i, row) in enumerate(reader, 1): print i corpus.add(row[6].decode('utf-8'), i) if i == 10: break print len(corpus) print corpus.get()
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

from corpora import Corpus


def _main():
    """Convert a scala token/metadata dump into a python corpus file."""
    parser = argparse.ArgumentParser(
        description="load files of scala file and convert it to a python "
                    "corpus using a dictionary"
    )
    parser.add_argument(
        "scala_file",
        help="python pickle file, containing tokens and metadata")
    parser.add_argument("dictionary")
    parser.add_argument("corpus")
    args = parser.parse_args()

    print("loading scala_file")
    corpus = Corpus.load(scala_file=args.scala_file,
                         dictionary_file=args.dictionary)
    print("writing corpus to file")
    corpus.save(documents_file=args.corpus)


if __name__ == "__main__":
    _main()
def test_get_chunk(self):
    """A freshly created corpus must expose a non-None current chunk."""
    Corpus.create('/tmp/TEST_CORPUS', chunk_size=10)
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    self.assertIsNotNone(c.get_chunk())
from corpora import Corpus
import logging
from logging.config import fileConfig

# Setup logging.
# fileConfig('log_config.ini')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# Mirror all log output to a local file alongside the console.
hdlr = logging.FileHandler('./similarity.log')
log = logging.getLogger()
log.addHandler(hdlr)
log.info('Start!')

# Build the tokenized Romanian corpus (the Bessarabia variant is disabled).
corpuss = Corpus()
# corpuss.tokenize_bessarabia()
corpuss.tokenize_romania()
big_corpus = corpuss.tokenized_romania #+ corpuss.tokenized_romania
# log.info('Big corpus: %s' % big_corpus)

# remove common words and tokenize
# NOTE(review): the second and third string segments below concatenate
# without a separating space, fusing 'acésta' and 'tóte' into the single
# (never-matching) stopword 'acéstatóte' — confirm and fix upstream.
stoplist = set('această intru preste față făcut foarte fostu nóstre despre sale dara anulu inse alta cele sunt fara prin dupa cari aceasta sînt fără toate între după acii '\
    'cãtre decît suntu dein loru dela numai voru catu totu suntu acésta celu inca pndia pana acésta' \
    'tóte carea acesta tote candu intre dectu multu pote acestu nici tóte fost póte'.split())
log.info(stoplist)
# Keep words longer than 3 chars that are not stopwords.
# NOTE(review): words are compared as UTF-8 *bytes* against the stoplist —
# this only lines up on Python 2; confirm the target interpreter.
texts = [[
    word for word in document.lower().split()
    if ((len(word) > 3) and (word.encode('utf-8') not in stoplist))
] for document in big_corpus]
def test_add_get(self):
    """get() must return the (header, text) pair stored for each id."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Chrząszcz brzmi w czcinie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Żółte źrebie', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.save_indexes()
    d = Corpus('/tmp/TEST_CORPUS')
    # Read back out of insertion order to prove random access works.
    self.assertEqual(d.get(3), ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'],
                                 'id': 3}, u'Żółte źrebie'))
    self.assertEqual(d.get(1), ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'],
                                 'id': 1}, u'Gżegżółką jaźń'))
    self.assertEqual(d.get(2), ({'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'],
                                 'id': 2}, u'Chrząszcz brzmi w czcinie'))
import logging
from logging.config import fileConfig

# Setup logging.
# fileConfig('log_config.ini')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# Mirror all log output to a local file alongside the console.
hdlr = logging.FileHandler('./similarity.log')
log = logging.getLogger()
log.addHandler(hdlr)
log.info('Start!')

# NOTE(review): Corpus is used below but not imported in this chunk —
# presumably imported earlier in the file; verify.
corpuss = Corpus()
# corpuss.tokenize_bessarabia()
corpuss.tokenize_romania()
big_corpus = corpuss.tokenized_romania #+ corpuss.tokenized_romania
# log.info('Big corpus: %s' % big_corpus)

# remove common words and tokenize
# NOTE(review): the second and third string segments below concatenate
# without a separating space, fusing 'acésta' and 'tóte' into the single
# (never-matching) stopword 'acéstatóte' — confirm and fix upstream.
stoplist = set('această intru preste față făcut foarte fostu nóstre despre sale dara anulu inse alta cele sunt fara prin dupa cari aceasta sînt fără toate între după acii '\
    'cãtre decît suntu dein loru dela numai voru catu totu suntu acésta celu inca pndia pana acésta' \
    'tóte carea acesta tote candu intre dectu multu pote acestu nici tóte fost póte'.split())
log.info(stoplist)
# Keep words longer than 3 chars that are not stopwords.
# NOTE(review): words are compared as UTF-8 *bytes* against the stoplist —
# this only lines up on Python 2; confirm the target interpreter.
texts = [[word for word in document.lower().split()
          if ((len(word) > 3) and (word.encode('utf-8') not in stoplist))]
         for document in big_corpus]
# remove words that appear only once
def test_iter(self):
    """Iterating the corpus must follow insertion order, not id order."""
    Corpus.create('/tmp/TEST_CORPUS')
    # BUG FIX: remove the corpus directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, '/tmp/TEST_CORPUS')
    c = Corpus('/tmp/TEST_CORPUS')
    c.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
    c.save_indexes()
    d = Corpus('/tmp/TEST_CORPUS')
    ids = [t[0]['id'] for t in d]
    self.assertEqual(ids, [3, 1, 2])