Example #1
0
 def test_add_too_big(self):
     """Adding a document larger than the chunk size must raise ExceptionTooBig."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=12)
     corpus = Corpus(path)
     with self.assertRaises(Corpus.ExceptionTooBig):
         corpus.add(u'12345', 1)
     del corpus
     shutil.rmtree(path)
Example #2
0
 def test_add_get_duplicate(self):
     """Re-adding a document id that already exists must raise ExceptionDuplicate."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     c = Corpus(path)
     # Bug fix: the first add must succeed OUTSIDE the assertRaises block.
     # Originally both calls sat inside it, so the test also passed if the
     # *first* add raised and the duplicate path was never exercised.
     c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     with self.assertRaises(Corpus.ExceptionDuplicate):
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     del c
     shutil.rmtree(path)
Example #3
0
 def test_create(self):
     """A fresh corpus exposes its name, chunk 0, and both files on disk."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, name=u"Fancy name")
     corpus = Corpus(path)
     self.assertEqual(corpus.get_property('name'), u'Fancy name')
     self.assertEqual(corpus.get_property('current_chunk'), 0)
     for expected in (Corpus.CONFIG_FILE, Corpus.CHUNK_PREFIX + '0'):
         self.assertTrue(os.path.isfile(os.path.join('/tmp/TEST_CORPUS/', expected)))
     del corpus
     shutil.rmtree(path)
Example #4
0
 def test_getitem(self):
     """Indexing with [] must return exactly what get() returns."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     corpus = Corpus(path)
     texts = [u'Gżegżółką jaźń', u'Chrząszcz brzmi w czcinie', u'Żółte źrebie']
     for doc_id, text in enumerate(texts, 1):
         corpus.add(text, doc_id, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     self.assertEqual(corpus.get(2), corpus[2])
     del corpus
     shutil.rmtree(path)
Example #5
0
    def __init__(self, corpus_name, path, deterministic=False):
        """Index the train/test files under `path`, then delegate to Corpus.

        Args:
            corpus_name: name forwarded to the Corpus base initializer.
            path: root directory laid out as train/<label>/<file> and
                test/<label>/<file>.
            deterministic: stored flag; its consumer is outside this snippet.
        """
        self._path = path
        self._deterministic = deterministic

        # Map split flag -> file list: True selects training files, False test.
        self._filenames = {}
        # Fix: `print` statement is Python 2-only syntax; the parenthesized
        # form behaves identically on Python 2 and 3 for a single argument.
        print("Searching %s/train/*/*" % self._path)
        self._filenames[True] = glob("%s/train/*/*" % self._path)
        self._filenames[False] = glob("%s/test/*/*" % self._path)

        Corpus.__init__(self, corpus_name)
Example #6
0
  def __init__(self, corpus_name, path, deterministic=False):
    """Index the train/test files under `path`, then delegate to Corpus.

    Args:
        corpus_name: name forwarded to the Corpus base initializer.
        path: root directory laid out as train/<label>/<file> and
            test/<label>/<file>.
        deterministic: stored flag; its consumer is outside this snippet.
    """
    self._path = path
    self._deterministic = deterministic

    # Map split flag -> file list: True selects training files, False test.
    self._filenames = {}
    # Fix: `print` statement is Python 2-only syntax; the parenthesized
    # form behaves identically on Python 2 and 3 for a single argument.
    print("Searching %s/train/*/*" % self._path)
    self._filenames[True] = glob("%s/train/*/*" % self._path)
    self._filenames[False] = glob("%s/test/*/*" % self._path)

    Corpus.__init__(self, corpus_name)
Example #7
0
 def test_save_config(self):
     """A property changed via set_property + save_config persists across re-open."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, name=u"Fancy name")
     writer = Corpus(path)
     writer.set_property('name', u"Not fancy")
     writer.save_config()
     reader = Corpus(path)
     self.assertEqual(reader.get_property('name'), u"Not fancy")
     del writer, reader
     shutil.rmtree(path)
Example #8
0
 def test_len(self):
     """len() reports the number of documents added so far."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     corpus = Corpus(path)
     texts = [u'Gżegżółką jaźń', u'Chrząszcz brzmi w czcinie', u'Żółte źrebie']
     for doc_id, text in enumerate(texts, 1):
         corpus.add(text, doc_id, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     self.assertEqual(len(corpus), 3)
     del corpus
     shutil.rmtree(path)
def test_save_load():
    """Round-trip a corpus through save()/load() and check it survives intact."""
    c, docs = mock_corpus()
    fd, filename = tempfile.mkstemp()
    dict_fd, dict_filename = tempfile.mkstemp()
    metadata_fd, metadata_filename = tempfile.mkstemp()
    # Bug fix: the original leaked metadata_fd (mkstemp fds must be closed by
    # the caller) and never removed metadata_filename afterwards.
    os.close(metadata_fd)
    try:
        f = None
        dict_f = None
        try:
            f = os.fdopen(fd, 'wb')
            dict_f = os.fdopen(dict_fd, 'wb')
            c.save(documents_file=f, dictionary_file=dict_f,
                   metadata_filename=metadata_filename)
        finally:
            if f is not None:
                f.close()
            if dict_f is not None:
                dict_f.close()

        new_c = Corpus.load(
            documents_file=filename,
            dictionary_file=dict_filename,
            metadata_filename=metadata_filename)
        assert_equals(c.documents, new_c.documents)
        assert_true(all(c.metadata == new_c.metadata))
        assert_equals(c.dic, new_c.dic)
    finally:
        os.remove(filename)
        os.remove(dict_filename)
        os.remove(metadata_filename)
Example #10
0
    def test_make_new_chunk(self):
        """make_new_chunk bumps current_chunk and creates the new chunk file."""
        path = '/tmp/TEST_CORPUS'
        Corpus.create(path, name=u"Fancy name")
        writer = Corpus(path)
        writer.make_new_chunk()

        reader = Corpus(path)
        self.assertEqual(reader.get_property('current_chunk'), 1)
        chunk_path = os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '1')
        self.assertTrue(os.path.isfile(chunk_path))

        del writer, reader
        shutil.rmtree(path)
Example #11
0
    def test_test_chunk_size(self):
        """Sizes up to chunk_size pass; anything larger raises ExceptionTooBig."""
        path = '/tmp/TEST_CORPUS'
        Corpus.create(path, chunk_size=10)
        corpus = Corpus(path)

        for ok_size in (5, 10):
            self.assertTrue(corpus.test_chunk_size(ok_size))
        with self.assertRaises(Corpus.ExceptionTooBig):
            corpus.test_chunk_size(11)
        del corpus
        shutil.rmtree(path)
def test_save_load_dictionary():
    """A dictionary written by save_dictionary() is read back identically."""
    c, docs = mock_corpus()
    dict_fd, dict_filename = tempfile.mkstemp()
    try:
        dict_f = os.fdopen(dict_fd, 'wb')
        try:
            c.save_dictionary(dict_f)
        finally:
            dict_f.close()

        reloaded = Corpus()
        reloaded.load_dictionary(dict_filename)
        assert_equals(c.dic, reloaded.dic)
    finally:
        os.remove(dict_filename)
Example #13
0
    def test_chunking(self):
        """A second add that exceeds chunk_size must land in chunk 1."""
        path = '/tmp/TEST_CORPUS'
        Corpus.create(path, chunk_size=13)
        corpus = Corpus(path)
        corpus.add(u'12345', 1)
        corpus.add(u'12345', 2)

        chunk_number, offset, head_len, text_len = corpus.get_idx(corpus.get_ridx(2))
        self.assertEqual(chunk_number, 1)

        del corpus
        shutil.rmtree(path)
Example #14
0
 def test_test_chunk_size(self):
     """Sizes up to chunk_size pass; anything larger raises ExceptionTooBig."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=10)
     corpus = Corpus(path)

     for ok_size in (5, 10):
         self.assertTrue(corpus.test_chunk_size(ok_size))
     with self.assertRaises(Corpus.ExceptionTooBig):
         corpus.test_chunk_size(11)
     del corpus
     shutil.rmtree(path)
Example #15
0
 def test_add_too_big(self):
     """Adding a document larger than the chunk size must raise ExceptionTooBig."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=12)
     corpus = Corpus(path)
     with self.assertRaises(Corpus.ExceptionTooBig):
         corpus.add(u'12345', 1)
     del corpus
     shutil.rmtree(path)
Example #16
0
 def test_chunking(self):
     """A second add that exceeds chunk_size must land in chunk 1."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=13)
     corpus = Corpus(path)
     corpus.add(u'12345', 1)
     corpus.add(u'12345', 2)

     chunk_number, offset, head_len, text_len = corpus.get_idx(corpus.get_ridx(2))
     self.assertEqual(chunk_number, 1)

     del corpus
     shutil.rmtree(path)
Example #17
0
 def test_make_new_chunk(self):
     """make_new_chunk bumps current_chunk and creates the new chunk file."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, name=u"Fancy name")
     writer = Corpus(path)
     writer.make_new_chunk()

     reader = Corpus(path)
     self.assertEqual(reader.get_property('current_chunk'), 1)
     chunk_path = os.path.join('/tmp/TEST_CORPUS/', Corpus.CHUNK_PREFIX + '1')
     self.assertTrue(os.path.isfile(chunk_path))

     del writer, reader
     shutil.rmtree(path)
Example #18
0
    def coordinates_input(t):
        """Parse "path,source_corpus,target_corpus" into (path, Corpus, Corpus).

        Raises argparse.ArgumentTypeError on a malformed triple or an
        unknown corpus name, so argparse reports a clean usage error.
        """
        try:
            path, source_corpus, target_corpus = t.split(',')
        except ValueError:
            # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
            # only a wrong number of comma-separated fields can fail here.
            raise argparse.ArgumentTypeError(
                'Coordinates input must be coordinates_path,source_corpus,target_corpus'
            )

        try:
            source_corpus = Corpus.argparse(source_corpus)
        except Exception:
            # Exception type of Corpus.argparse is not visible here; Exception
            # keeps the intent of the original bare except without masking
            # interpreter exits.
            raise argparse.ArgumentTypeError(
                f'{source_corpus} is not a valid value for enum Corpus')

        try:
            target_corpus = Corpus.argparse(target_corpus)
        except Exception:
            raise argparse.ArgumentTypeError(
                f'{target_corpus} is not a valid value for enum Corpus')

        return path, source_corpus, target_corpus
Example #19
0
 def test_add_get_duplicate(self):
     """Re-adding a document id that already exists must raise ExceptionDuplicate."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     c = Corpus(path)
     # Bug fix: the first add must succeed OUTSIDE the assertRaises block.
     # Originally both calls sat inside it, so the test also passed if the
     # *first* add raised and the duplicate path was never exercised.
     c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     with self.assertRaises(Corpus.ExceptionDuplicate):
         c.add(u'Gżegżółką jaźń', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     del c
     shutil.rmtree(path)
Example #20
0
 def test_create(self):
     """A fresh corpus exposes its name, chunk 0, and both files on disk."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, name=u"Fancy name")
     corpus = Corpus(path)
     self.assertEqual(corpus.get_property('name'), u'Fancy name')
     self.assertEqual(corpus.get_property('current_chunk'), 0)
     for expected in (Corpus.CONFIG_FILE, Corpus.CHUNK_PREFIX + '0'):
         self.assertTrue(
             os.path.isfile(os.path.join('/tmp/TEST_CORPUS/', expected)))
     del corpus
     shutil.rmtree(path)
    def generateVisualization(self, guid):
        """
            Given a guid, it prepares its explainer document.

            Loads the SciDoc for `guid`, computes the token overlap between
            the document and each in-collection reference it cites, assigns
            every unique token an id, then writes a JSON token-overlap file
            and a padded HTML rendering into output_dir.
        """
##        output_dir=Corpus.dir_output
        # NOTE(review): hard-coded local Windows path overriding the
        # commented-out Corpus.dir_output above — presumably a debugging
        # leftover; confirm before running elsewhere.
        output_dir=r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\"

        doc=Corpus.loadSciDoc(guid)
        Corpus.tagAllReferencesAsInCollectionOrNot(doc)
        # Token counts for the current document.
        counts1=self.getDocumentTokens(doc)

        # generate a unique id for each unique term, make a dictionary
        for index, token in enumerate(counts1):
            self.term_info[token]={"token_id":str(index), "references": []}

        # ref id -> tokens shared between this doc and that reference.
        self.overlapping_tokens={}

        # NOTE(review): in_collection_references is assigned but never used
        # below — looks like dead code.
        in_collection_references=Corpus.getMetadataByGUID(guid)["outlinks"]
        for ref in doc["references"]:
            match=Corpus.matchReferenceInIndex(ref)
            if match:
                doc2=Corpus.loadSciDoc(match["guid"])
                counts2=self.getDocumentTokens(doc2)
                # for each in_collection_reference number (0 onwards) we store the list
                # of its overlapping tokens with the current document

                self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2)

                # Record, per token, which references share it with this doc.
                for token in self.overlapping_tokens[ref["id"]]:
                    ref_list=self.term_info[token]["references"]
                    if ref["id"] not in ref_list:
                        ref_list.append(ref["id"])

        # try to find some signal in the noise
        self.filterTokens()

        # Emit the overlap data as a JS variable assignment for the HTML page.
        json_str="var token_data=" + json.dumps(self.overlapping_tokens) + ";"
        writeFileText(json_str, output_dir+guid+"_data.json")

        html=doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
##            extra_attribute_function=self.extraAttributes,
            citation_formatting_function=self.citationFormatting,
            reference_formatting_function=self.referenceFormatting,
            text_formatting_function=self.textFormatting
            )
        # Wrap the rendered document with the surrounding page chrome.
        html=self.padWithHTML(html, guid)
        writeFileText(html,output_dir+guid+"_vis.html")
Example #22
0
    def generateVisualization(self, guid):
        """
            Given a guid, it prepares its explainer document.

            Loads the SciDoc for `guid`, computes the token overlap between
            the document and each in-collection reference it cites, assigns
            every unique token an id, then writes a JSON token-overlap file
            and a padded HTML rendering into output_dir.
        """
##        output_dir=Corpus.dir_output
        # NOTE(review): hard-coded local Windows path overriding the
        # commented-out Corpus.dir_output above — presumably a debugging
        # leftover; confirm before running elsewhere.
        output_dir=r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\"

        doc=Corpus.loadSciDoc(guid)
        Corpus.tagAllReferencesAsInCollectionOrNot(doc)
        # Token counts for the current document.
        counts1=self.getDocumentTokens(doc)

        # generate a unique id for each unique term, make a dictionary
        for index, token in enumerate(counts1):
            self.term_info[token]={"token_id":str(index), "references": []}

        # ref id -> tokens shared between this doc and that reference.
        self.overlapping_tokens={}

        # NOTE(review): in_collection_references is assigned but never used
        # below — looks like dead code.
        in_collection_references=Corpus.getMetadataByGUID(guid)["outlinks"]
        for ref in doc["references"]:
            match=Corpus.matchReferenceInIndex(ref)
            if match:
                doc2=Corpus.loadSciDoc(match["guid"])
                counts2=self.getDocumentTokens(doc2)
                # for each in_collection_reference number (0 onwards) we store the list
                # of its overlapping tokens with the current document

                self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2)

                # Record, per token, which references share it with this doc.
                for token in self.overlapping_tokens[ref["id"]]:
                    ref_list=self.term_info[token]["references"]
                    if ref["id"] not in ref_list:
                        ref_list.append(ref["id"])

        # try to find some signal in the noise
        self.filterTokens()

        # Emit the overlap data as a JS variable assignment for the HTML page.
        json_str="var token_data=" + json.dumps(self.overlapping_tokens) + ";"
        writeFileText(json_str, output_dir+guid+"_data.json")

        html=doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
##            extra_attribute_function=self.extraAttributes,
            citation_formatting_function=self.citationFormatting,
            reference_formatting_function=self.referenceFormatting,
            text_formatting_function=self.textFormatting
            )
        # Wrap the rendered document with the surrounding page chrome.
        html=self.padWithHTML(html, guid)
        writeFileText(html,output_dir+guid+"_vis.html")
def main():
    """Render a visualization for the most reference-rich paper in the corpus."""
##    explainZoning("P95-1026")
    papers = Corpus.listPapers("num_in_collection_references > 10 order by num_in_collection_references desc")
    VisGenerator().generateVisualization(papers[0])
Example #24
0
 def test_add_get(self):
     """Documents added and indexed come back from get() with their metadata."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     writer = Corpus(path)
     texts = {
         1: u'Gżegżółką jaźń',
         2: u'Chrząszcz brzmi w czcinie',
         3: u'Żółte źrebie',
     }
     for doc_id in (1, 2, 3):
         writer.add(texts[doc_id], doc_id, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.save_indexes()
     reader = Corpus(path)
     # Read back in a different order than insertion to exercise the index.
     for doc_id in (3, 1, 2):
         head = {'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'], 'id': doc_id}
         self.assertEqual(reader.get(doc_id), (head, texts[doc_id]))
     del writer, reader
     shutil.rmtree(path)
Example #25
0
 def __init__(self, name = "wiki"):
   # Thin subclass constructor: forwards the corpus name (default "wiki")
   # to the Corpus base initializer; adds no state of its own.
   Corpus.__init__(self, name)
Example #26
0
 def test_iter(self):
     """Iterating the corpus yields documents in insertion order."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     writer = Corpus(path)
     writer.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.save_indexes()
     reader = Corpus(path)
     seen_ids = [t[0]['id'] for t in reader]
     self.assertEqual(seen_ids, [3, 1, 2])
     del writer, reader
     shutil.rmtree(path)
Example #27
0
 def __init__(self, name="wiki"):
     # Thin subclass constructor: forwards the corpus name (default "wiki")
     # to the Corpus base initializer; adds no state of its own.
     Corpus.__init__(self, name)
"""

"""

from corpora import Corpus
from nltk.corpus import PlaintextCorpusReader
import csv

corpus_path = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/test'
Corpus.create(corpus_path)
corpus = Corpus(corpus_path)

training_file_path = "/home/mayank/IdeaProjects/Lab_Machine_Learning/src/resources/TrainingData.csv"
reader = csv.reader(open(training_file_path, 'r'))

for (i, row) in enumerate(reader, 1):
    print i
    corpus.add(row[6].decode('utf-8'), i)
    if i == 10: break

print len(corpus)
print corpus.get()
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from corpora import Corpus


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="load files of scala file and convert it to a python " "corpus using a dictionary"
    )
    parser.add_argument("scala_file", help="python pickle file, containing tokens and metadata")
    parser.add_argument("dictionary")
    parser.add_argument("corpus")
    args = parser.parse_args()

    print("loading scala_file")
    corpus = Corpus.load(scala_file=args.scala_file, dictionary_file=args.dictionary)

    print("writing corpus to file")
    corpus.save(documents_file=args.corpus)
Example #30
0
def main():
    """Render a visualization for the most reference-rich paper in the corpus."""
##    explainZoning("P95-1026")
    papers = Corpus.listPapers("num_in_collection_references > 10 order by num_in_collection_references desc")
    VisGenerator().generateVisualization(papers[0])
Example #31
0
 def test_get_chunk(self):
     """get_chunk() returns a usable (non-None) chunk handle."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=10)
     corpus = Corpus(path)
     self.assertIsNotNone(corpus.get_chunk())
     del corpus
     shutil.rmtree(path)
Example #32
0
from corpora import Corpus

import logging
from logging.config import fileConfig

# Setup logging.
# fileConfig('log_config.ini')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
hdlr = logging.FileHandler('./similarity.log')
log = logging.getLogger()
log.addHandler(hdlr)

log.info('Start!')

corpuss = Corpus()
# corpuss.tokenize_bessarabia()
corpuss.tokenize_romania()
big_corpus = corpuss.tokenized_romania  #+ corpuss.tokenized_romania

# log.info('Big corpus: %s' % big_corpus)

# remove common words and tokenize
# Fix: added the missing trailing space before the closing quote of the
# second literal — implicit string concatenation previously fused the last
# word of that line with 'tóte' into the bogus stopword 'acéstatóte'.
stoplist = set('această intru preste față făcut foarte fostu nóstre despre sale dara anulu inse alta cele sunt fara prin dupa cari aceasta sînt fără toate între după acii '\
'cãtre decît suntu dein loru dela numai voru catu totu suntu acésta celu inca pndia pana acésta ' \
'tóte carea acesta tote candu intre dectu multu pote acestu nici tóte fost póte'.split())
log.info(stoplist)
# NOTE(review): word.encode('utf-8') compared against str stoplist entries
# only matches on Python 2; on Python 3 bytes never equal str — confirm the
# intended interpreter.
texts = [[
    word for word in document.lower().split()
    if ((len(word) > 3) and (word.encode('utf-8') not in stoplist))
] for document in big_corpus]
Example #33
0
 def test_save_config(self):
     """A property changed via set_property + save_config persists across re-open."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, name=u"Fancy name")
     writer = Corpus(path)
     writer.set_property('name', u"Not fancy")
     writer.save_config()
     reader = Corpus(path)
     self.assertEqual(reader.get_property('name'), u"Not fancy")
     del writer, reader
     shutil.rmtree(path)
Example #34
0
 def test_get_chunk(self):
     """get_chunk() returns a usable (non-None) chunk handle."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path, chunk_size=10)
     corpus = Corpus(path)
     self.assertIsNotNone(corpus.get_chunk())
     del corpus
     shutil.rmtree(path)
Example #35
0
 def test_add_get(self):
     """Documents added and indexed come back from get() with their metadata."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     writer = Corpus(path)
     texts = {
         1: u'Gżegżółką jaźń',
         2: u'Chrząszcz brzmi w czcinie',
         3: u'Żółte źrebie',
     }
     for doc_id in (1, 2, 3):
         writer.add(texts[doc_id], doc_id, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.save_indexes()
     reader = Corpus(path)
     # Read back in a different order than insertion to exercise the index.
     for doc_id in (3, 1, 2):
         head = {'p1': 1, 'p2': "2", 'p3': [1, 2, 3, u'ą'], 'id': doc_id}
         self.assertEqual(reader.get(doc_id), (head, texts[doc_id]))
     del writer, reader
     shutil.rmtree(path)
Example #36
0
import logging
from logging.config import fileConfig

# Fix: `Corpus` is used below but never imported in this snippet.
# NOTE(review): assuming the same `corpora` module used by the sibling
# script in this file — confirm the import path.
from corpora import Corpus

# Setup logging.
# fileConfig('log_config.ini')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
hdlr = logging.FileHandler('./similarity.log')
log = logging.getLogger()
log.addHandler(hdlr)

log.info('Start!')


corpuss = Corpus()
# corpuss.tokenize_bessarabia()
corpuss.tokenize_romania()
big_corpus = corpuss.tokenized_romania #+ corpuss.tokenized_romania

# log.info('Big corpus: %s' % big_corpus)

# remove common words and tokenize
# Fix: added the missing trailing space before the closing quote of the
# second literal — implicit string concatenation previously fused the last
# word of that line with 'tóte' into the bogus stopword 'acéstatóte'.
stoplist = set('această intru preste față făcut foarte fostu nóstre despre sale dara anulu inse alta cele sunt fara prin dupa cari aceasta sînt fără toate între după acii '\
'cãtre decît suntu dein loru dela numai voru catu totu suntu acésta celu inca pndia pana acésta ' \
'tóte carea acesta tote candu intre dectu multu pote acestu nici tóte fost póte'.split())
log.info(stoplist)
# NOTE(review): word.encode('utf-8') compared against str stoplist entries
# only matches on Python 2; on Python 3 bytes never equal str — confirm the
# intended interpreter.
texts = [[word for word in document.lower().split() if ((len(word) > 3) and (word.encode('utf-8') not in stoplist))]
         for document in big_corpus]

# remove words that appear only once
Example #37
0
 def test_iter(self):
     """Iterating the corpus yields documents in insertion order."""
     path = '/tmp/TEST_CORPUS'
     Corpus.create(path)
     writer = Corpus(path)
     writer.add(u'Gżegżółką jaźń', 3, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Chrząszcz brzmi w czcinie', 1, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.add(u'Żółte źrebie', 2, p1=1, p2="2", p3=[1, 2, 3, u'ą'])
     writer.save_indexes()
     reader = Corpus(path)
     seen_ids = [t[0]['id'] for t in reader]
     self.assertEqual(seen_ids, [3, 1, 2])
     del writer, reader
     shutil.rmtree(path)