'''
Created on Sep 28, 2012

@author: inesmeya
'''
from concept import Concept
from model.database_wrapper import DatabaseWrapper
from model.build_utils import build_word_index, build_index_by_words, build_df, build_wieght_table, build_wieght_table_dok
from scipy.sparse import csr_matrix as matrix
from model.math_utils import normalize
from model.logger import getLogger

_log = getLogger('db_builder')


class DbBuilder(object):
    '''
    This class builds a database (word inverted index) from wiki documents.

    General usage is:
        * Repeatedly add documents
        * Build the database

    When a document is added, it is converted to a Concept, which is just a
    bag of words representing the corresponding document. Each Concept has a
    unique ID, and we do not allow duplicate Concept IDs.

    For testability - if the id is None, it will be auto-generated.
    '''

    def __init__(self, stemmer):
        '''
        Constructor
        '''
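# A minimal usage sketch for DbBuilder, following the docstring above. The
# method names add_document() and build() are assumptions (the class body is
# truncated in this excerpt), so check them against the real API:
#
#     from model.db_builder import DbBuilder
#     from model.stemmers import StopWordsStemmer
#
#     builder = DbBuilder(StopWordsStemmer())
#     for doc in parsed_wiki_docs:     # each document becomes a Concept
#         builder.add_document(doc)    # duplicate Concept IDs are rejected
#     database = builder.build()       # the word inverted index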
#!/usr/bin/python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as etxml
import StringIO
from model.logger import getLogger

_log = getLogger(__name__)


class WdNames:
    CLEAN_TEXT = 'clean_text'
    LINKS = 'links'


def doc_from_xml(doc_tag):
    doc = doc_tag.attrib
    wd = WikiDocument(doc['id'], doc['title'], None, doc['rev_id'],
                      clean_text=doc_tag.text)
    return wd


def wiki_doc_to_xml(wiki_doc):
    '''@param wiki_doc: the WikiDocument to serialize
    @return: string with the xml representation
    '''
    el_doc = etxml.Element('doc')
    el_doc.attrib['id'] = str(wiki_doc.id)
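# Sketch: the <doc> element shape that doc_from_xml() above expects - id,
# title and rev_id attributes plus the cleaned article text as the element
# body. The resulting WikiDocument arguments are inferred from the call site.

snippet = '<doc id="42" title="Knowledge" rev_id="7">knowledge is power</doc>'
wd = doc_from_xml(etxml.fromstring(snippet))
# wd == WikiDocument('42', 'Knowledge', None, '7', clean_text='knowledge is power')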
#!/usr/bin/python
# -*- coding: utf-8 -*-
import bz2
import codecs
import os
import pickle

from model.db_builder import DbBuilder
from model import stemmers
from model.wiki_doc import wiki_doc_to_xml
from model.semantic_interpreter import SemanticComparer
from model.database_wrapper import DatabaseWrapper, DbContant
from model.logger import getLogger
from parsers import web_tools
from parsers import parse_tools

_log = getLogger("WikiKnows")


def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)


def make_dump(wiki_dump, articles_titles, compress=False):
    """Download the specified articles from the Wikipedia site, merge them
    into one file, and compress it as a Wikipedia dump file.

    @param articles_titles: the articles' canonical names on the Wikipedia web page
    @param wiki_dump: output filename (if not specified, a default is used)
    """
    _log.debug("-" * 80)
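# Sketch of calling make_dump() as declared above; the titles mirror the ones
# used by the test suite below. compress=True is expected to produce a
# bz2-compressed dump (bz2 is imported above), though the function body is
# truncated in this excerpt.

make_dump("output/Knowledge_Love_War.xml",
          ["Knowledge", "Love", "War"],
          compress=True)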
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import gzip
import urllib2
from subprocess import Popen, PIPE
import model.logger as lg

_log = lg.getLogger(__name__)


def ensure_dir(f):
    d = os.path.dirname(f)
    if len(d) == 0:
        return
    if not os.path.exists(d):
        os.makedirs(d)


base_xml_url = "http://en.wikipedia.org/wiki/Special:Export/"


def get_article_xml_url(article_title):
    url = base_xml_url + article_title
    return url


def get_wiki_xmlpage_wget(article_title):
    url = get_article_xml_url(article_title)
    try:
        cmd = ["wget", '-qO-', '-S', url]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    except OSError:
        # "retry on macOS" - the original bare string here was a no-op; log
        # the failure instead until a real retry/fallback is implemented
        _log.warning("wget failed for %s", url)
        raise
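# Sketch: fetching the Special:Export XML for one article with the helpers
# above. get_wiki_xmlpage_wget() shells out to wget; its return value is
# truncated in this excerpt, so treating it as the page body is an assumption.

url = get_article_xml_url("Knowledge")
# -> "http://en.wikipedia.org/wiki/Special:Export/Knowledge"
page_xml = get_wiki_xmlpage_wget("Knowledge")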
from math import log

from scipy.linalg import norm
from scipy.sparse import csr_matrix as matrix
import scipy.sparse as sps

from model.logger import getLogger

_log = getLogger(__name__)

# TODO: use scipy.spatial.distance for metrics


def cosine_metrics(v1, v2):
    # for identical vectors; `if v1 == v2` is ambiguous on sparse matrices,
    # so compare via the number of differing entries instead
    if (v1 != v2).nnz == 0:
        return 1.0
    # treat zero vectors
    denom = norm(v1.todense()) * norm(v2.todense())
    if denom == 0.0:
        _log.warning("One of the vectors is zero!")
        return 0.0
    similarity = v1.dot(v2.T) / denom
    return float(similarity[0, 0])


def get_vectors_centroid(list_of_vectors):
    """Gets a list of scipy vectors with the same dimensions and returns
    their centroid."""
    n = len(list_of_vectors)
    if n == 0:
        return
    # on a 1d vector, shape holds the length
    shape = list_of_vectors[0].shape
    ret_vec = matrix(shape)
    for vector in list_of_vectors:
        #_log.debug("get_vectors_centroid: Adding vector {}".format(vector))
        ret_vec = ret_vec + vector
    return ret_vec / n
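# Sketch: cosine_metrics() on sparse row vectors - parallel vectors score 1.0,
# orthogonal ones 0.0 - and the centroid as the element-wise mean.

from scipy.sparse import csr_matrix

a = csr_matrix([[1.0, 0.0, 1.0]])
b = csr_matrix([[2.0, 0.0, 2.0]])
c = csr_matrix([[0.0, 1.0, 0.0]])

print(cosine_metrics(a, b))   # 1.0: same direction
print(cosine_metrics(a, c))   # 0.0: orthogonal
print(get_vectors_centroid([a, b]).todense())   # [[1.5, 0.0, 1.5]]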
import unittest
import os

from wiki_knows import wiki_knowledge
from model.stemmers import StopWordsStemmer
from parsers import parse_tools
import test.test_utils as test_utils
from io_test_utils import getOutputFile
from model import semantic_interpreter
from model import logger

_log = logger.getLogger(__name__)


class Test(unittest.TestCase):

    def setUp(self):
        self.tmp_dump_file = getOutputFile("wiki_knowledge_output.xml")
        self.tmp_wdb_file = getOutputFile("wiki_knowledge_output.wdb")
        self.tmp_parse_file = getOutputFile("wiki_knowledge_output.parsed.xml")
        self.expected_articles = ['Knowledge', 'Love', 'War']
        self.expected_xml_path = os.path.join(
            os.path.dirname(__file__),
            "expected_results/expected_xml_Knowledge_Love_War.xml")
        self.expected_wdb_path = os.path.join(
            os.path.dirname(__file__),
            "expected_results/expected_Knowledge_Love_War.wdb")

    def tearDown(self):
        pass

    def test__execution(self):
        """This is not exactly a test, but a program execution..."""
        text1 = "i love to learn"
        text2 = "the world we know"