def test_generate_neighbor_list(self):
    """Neighbor list is ranked by distance; the two zero-distance docs may come in either order."""
    corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
    ranked = corpus.generate_neighbor_list(self.d1)
    # d1 and d3 both sit at distance 0, so positions 0 and 1 hold them
    # in an unspecified order
    self.assertEqual(set([ranked[0], ranked[1]]), set([("1", 0), ("3", 0)]))
    self.assertEqual(ranked[2], ("4", 2))
    self.assertEqual(ranked[3], ("2", 20))
def test_neighbors(self):
    """neighbors() returns exactly the documents within the given distance."""
    corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
    expectations = [
        (10, set(["1", "3", "4"])),
        (20, set(["1", "2", "3", "4"])),
        (0, set(["1", "3"])),
    ]
    for radius, expected_ids in expectations:
        found = set([doc.doc_id for doc in corpus.neighbors(self.d1, radius)])
        self.assertEqual(expected_ids, found)
def __init__(self, output=None, corpus=None, attribute="categories",
             categories=None, mode="combined"):
    """
    Create a CategoryToCorpus module, which loads a corpus with tagged
    documents.  If corpus is passed in, it adds to an existing corpus.

    mode is the corpus loading method to use.  If set to "combined",
    all documents in a category are concatenated to a single document.
    Otherwise each document is loaded separately.
    """
    self.output = output
    # per-category corpora; populated only when mode != "combined"
    self.corpora = {}
    # combined mode has a single corpus.
    # FIX: use "is None" instead of "== None" for the identity check (PEP 8).
    self.corpus = Corpus() if corpus is None else corpus
    self.module_type = enumModuleType(enumModuleType.Document)
    self.module_processing_type = \
        enumModuleProcessingType(enumModuleProcessingType.PostProcess)
    self.attribute = attribute
    self.categories = categories
    self.mode = mode
    self.pp = pprint.PrettyPrinter(indent=4)
class AddToCorpus(PipelineModule):
    """Pipeline module that accumulates processed documents into a Corpus."""

    def __init__(self, output=None, corpus=None):
        """
        output: optional path write() dumps the corpus to as JSON.
        corpus: existing Corpus to extend; a fresh one is created if None.
        """
        self.output = output
        self.corpus = Corpus() if corpus is None else corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)

    def process(self, data):
        """Add every document in data to the corpus.

        BUG FIX: the original called self.corpus.add(data) -- adding the
        whole iterable once per document -- and printed `data` as debug
        noise on each iteration.  Add the current document instead.
        """
        for document in data:
            self.corpus.add(document)

    def post_process(self):
        """Return the accumulated corpus after all data has been processed."""
        return self.corpus

    def as_json(self):
        """Serialize the corpus to a JSON string.

        BUG FIX: the original discarded json.dumps()'s result and
        implicitly returned None, so write() had nothing usable to write.
        """
        return json.dumps(self.corpus, sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """Write the JSON serialization to self.output, if one was given."""
        if self.output is not None:
            # context manager guarantees the handle is closed on error
            with open(self.output, 'w') as f:
                f.write(self.as_json())
class CategoryToCorpus(PipelineModule): def __init__(self, output=None, corpus=None, attribute="categories", categories=None, mode="combined"): """ Create a CategoryToCorpus module, which loads a corpus with tagged documents. If corpus is passed in, it adds to an existing corpus. mode is the corpus loading method to use. If set to "combined", all documents in a category are concatenated to a single document. Otherwise each document is loaded separately. """ self.output = output self.corpora = {} # combined mode has a single corpus if corpus == None: self.corpus = Corpus() else: self.corpus = corpus self.module_type = enumModuleType(enumModuleType.Document) self.module_processing_type = \ enumModuleProcessingType(enumModuleProcessingType.PostProcess) self.attribute = attribute self.categories = categories self.mode = mode self.pp = pprint.PrettyPrinter(indent=4) def add_document(self, category, document): if self.mode != "combined": if category in self.corpora: self.corpora[category].append(document) else: self.corpora[category] = [document] else: if category in self.corpus: d = self.corpus[category] d.update_text(unicode(d) + " " + unicode(document)) else: document.set_doc_id(category) self.corpus.add(document) def process(self, data): """ Process the documents. The code looks at the attribute attribute, which should be a list or dictionary, and builds a set of corpora from categories in that attribute. If category is set, it only builds a single corpus containing documents with that category. 
""" for doc in data: if self.attribute in doc.document: d = doc.document[self.attribute] if type(d) is list: if self.categories == None: for v in d: self.add_document(v, doc) else: for category in self.categories: if category in d: self.add_document(category, doc) yield doc def post_process(self): """ method that gets run after all data has been processed TODO: look into optimizing this, seems inefficient, written in derp-mode """ if self.mode != "combined": return self.corpora else: return self.corpus def as_json(self): if self.mode != "combined": c = self.corpora else: c = self.corpus json.dumps(c, sort_keys=True, indent=4, separators=(',', ': ')) def write(self): if self.output != None: f = open(self.output, 'w') f.write(self.as_json()) f.close() def top_categories(self, n=10): for doc_id in self.categories: print str(doc_id) rt = self.corpus.ranked_terms(doc_id, n) print " " + str(rt)
def setUp(self):
    """Build four small documents and a three-document base corpus fixture."""
    specs = [
        ("1", "This this."),
        ("2", "This is another test document."),
        ("3", "Two words."),
        ("4", "Three words."),
    ]
    self.d1, self.d2, self.d3, self.d4 = [
        Document({"id": doc_id, "body": body}) for doc_id, body in specs
    ]
    # d4 is deliberately left out of the base corpus
    self.corpus = Corpus([self.d1, self.d2, self.d3])
class CorpusTestCase(unittest.TestCase):
    """Unit tests for Corpus term statistics (df/idf/tf/tf-idf) and neighbor queries."""

    def setUp(self):
        # Fixture: d1 repeats "this"; the base corpus holds d1-d3 only
        # (d4 is added per-test in the neighbor tests below).
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        self.d4 = Document({"id": "4", "body": "Three words."})
        self.corpus = Corpus([self.d1, self.d2, self.d3])

    def test_df(self):
        # document frequency: how many documents contain the term
        self.assertEqual(self.corpus.df("this"), 2)
        self.assertEqual(self.corpus.df("is"), 1)
        self.assertEqual(self.corpus.df("two"), 1)
        self.assertEqual(self.corpus.df("."), 3)

    def test_idf(self):
        # assume math.log is good
        self.assertEqual(self.corpus.idf("this"), math.log(3.0 / 2.0))
        self.assertEqual(self.corpus.idf("is"), math.log(3.0 / 1.0))
        self.assertEqual(self.corpus.idf("."), math.log(3.0 / 3.0))

    def test_tf(self):
        self.assertEqual(self.corpus.tf("1", "this"), 2.0 / 3.0)
        self.assertEqual(self.corpus.tf("2", "this"), 1.0 / 6.0)
        self.assertEqual(self.corpus.tf("2", "is"), 1.0 / 6.0)
        self.assertEqual(self.corpus.tf("3", "."), 1.0 / 3.0)

    def test_tf_idf(self):
        # tf_idf should equal tf * idf for each (doc, term) pair
        self.assertEqual(self.corpus.tf_idf("1", "this"), (2.0 / 3.0) * math.log(3.0 / 2.0))
        self.assertEqual(self.corpus.tf_idf("2", "this"), (1.0 / 6.0) * math.log(3.0 / 2.0))
        self.assertEqual(self.corpus.tf_idf("2", "is"), (1.0 / 6.0) * math.log(3.0 / 1.0))
        self.assertEqual(self.corpus.tf_idf("3", "."), (1.0 / 3.0) * math.log(3.0 / 3.0))

    def test_vocabulary(self):
        v = self.corpus.vocabulary()
        self.assertEqual(v["."], 3)
        self.assertEqual(v["this"], 3)
        self.assertEqual(v["another"], 1)

    def test_generate_doc_lens(self):
        self.corpus.generate_doc_lens()
        result = {"1": 10, "2": 30, "3": 10}
        self.assertEqual(self.corpus.doc_lens, result)

    def test_generate_neighbor_list(self):
        corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
        l = corpus.generate_neighbor_list(self.d1)
        # d1 and d3 are both at distance 0, so they may appear in either order
        self.assertTrue(((l[0] == ("1", 0)) and (l[1] == ("3", 0))) or ((l[0] == ("3", 0)) and (l[1] == ("1", 0))))
        self.assertEqual(l[2], ("4", 2))
        self.assertEqual(l[3], ("2", 20))

    def test_neighbors(self):
        corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
        # radius 10 excludes d2 (distance 20, per test_generate_neighbor_list)
        neighbors = corpus.neighbors(self.d1, 10)
        n = [doc.doc_id for doc in neighbors]
        self.assertEqual(set(["1", "3", "4"]), set(n))
        # radius 20 includes everything
        neighbors = corpus.neighbors(self.d1, 20)
        n = [doc.doc_id for doc in neighbors]
        self.assertEqual(set(["1", "2", "3", "4"]), set(n))
        # radius 0 keeps only the zero-distance documents
        neighbors = corpus.neighbors(self.d1, 0)
        n = [doc.doc_id for doc in neighbors]
        self.assertEqual(set(["1", "3"]), set(n))
def __init__(self, output=None, corpus=None):
    """
    output: optional path used by write() to dump the corpus as JSON.
    corpus: existing Corpus to extend; a fresh one is created if None.
    """
    self.output = output
    # FIX: "is None" rather than "== None" for the identity check (PEP 8)
    self.corpus = Corpus() if corpus is None else corpus
    self.module_type = enumModuleType(enumModuleType.Document)
    self.module_processing_type = \
        enumModuleProcessingType(enumModuleProcessingType.PostProcess)