Example #1
'''
Created on Sep 28, 2012

@author: inesmeya
'''

from concept import Concept 
from model.database_wrapper import DatabaseWrapper
from model.build_utils import build_word_index, build_index_by_words, build_df, build_wieght_table, build_wieght_table_dok
from scipy.sparse import csr_matrix as matrix
from model.math_utils import normalize 

from model.logger import getLogger
_log = getLogger('db_builder')

class DbBuilder(object):
    '''
    Builds a database (an inverted word index) from wiki documents.
    General usage:
        * add documents, one at a time
        * build the database
    When a document is added, it is converted to a Concept, which is just a
    bag of words representing the corresponding document.

    Each Concept has a unique ID, and duplicate Concept IDs are not allowed.
    For testability, if the id is None it is auto-generated.
    '''

    def __init__(self, stemmer):
        '''
        Constructor
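The snippet truncates inside the constructor, so the builder's method names are not visible here. A minimal usage sketch following the docstring's add-then-build flow; the `add_document` and `build` method names and the `StopWordsStemmer` constructor arguments are assumptions, not the repository's confirmed API:

# Hypothetical usage sketch; add_document/build are assumed method names
# and the StopWordsStemmer constructor arguments are assumptions too.
from model.stemmers import StopWordsStemmer

wiki_documents = []                          # placeholder: iterable of parsed docs
builder = DbBuilder(StopWordsStemmer([]))    # stemmer built with an empty stop list
for doc in wiki_documents:
    builder.add_document(doc)                # each document becomes a Concept
database = builder.build()                   # materialize the inverted word index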
Example #2
#!/usr/bin/python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as etxml
import StringIO

from model.logger import getLogger
_log = getLogger(__name__)


class WdNames:
    CLEAN_TEXT = 'clean_text'
    LINKS = 'links'


def doc_from_xml(doc_tag):
    doc = doc_tag.attrib
    wd = WikiDocument(doc['id'],
                      doc['title'],
                      None,
                      doc['rev_id'],
                      clean_text=doc_tag.text)
    return wd


def wiki_doc_to_xml(wiki_doc):
    '''@param wiki_doc: the WikiDocument to serialize
       @return: string with the XML representation of the document
    '''
    el_doc = etxml.Element('doc')
    el_doc.attrib['id'] = str(wiki_doc.id)
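wiki_doc_to_xml truncates after writing the id attribute, but together with doc_from_xml above the intended round trip is clear. A small sketch; the <doc> attribute layout is inferred from doc_from_xml and may not match the full original function:

# Round-trip sketch; the <doc> attribute layout is inferred from
# doc_from_xml above, not confirmed by the truncated serializer.
import xml.etree.ElementTree as etxml

xml_text = '<doc id="42" title="Knowledge" rev_id="7">knowledge is power</doc>'
doc_tag = etxml.fromstring(xml_text)
wd = doc_from_xml(doc_tag)     # WikiDocument with clean_text filled in
xml_out = wiki_doc_to_xml(wd)  # back to an XML string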
Example #3
#!/usr/bin/python
# -*- coding: utf-8 -*-
 
import bz2
from model.db_builder import DbBuilder
from model import stemmers
from model.wiki_doc import wiki_doc_to_xml
from model.semantic_interpreter import SemanticComparer

from parsers import web_tools
from parsers import parse_tools
import codecs

from model.logger import getLogger
_log = getLogger("WikiKnows")

import os
import pickle
from model.database_wrapper import DatabaseWrapper, DbContant
def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)

def make_dump(wiki_dump, articles_titles, compress=False):
    """ Download specified articles from Wikipedia site, 
        merges them into one file, compresses it as Wikipedia dump file
        @param articles_titles: article's canonic name on Wikipedia web page
        @param wiki_dump: output filename (if not specified default is used)
    """
    _log.debug("-"*80)
Example #4
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import gzip
import urllib2
from subprocess import Popen, PIPE

import model.logger as lg
_log = lg.getLogger(__name__)


def ensure_dir(f):
    d = os.path.dirname(f)
    if len(d) == 0:
        return
    if not os.path.exists(d):
        os.makedirs(d)


base_xml_url = "http://en.wikipedia.org/wiki/Special:Export/"


def get_article_xml_url(article_title):
    url = base_xml_url + article_title
    return url


def get_wiki_xmlpage_wget(article_title):
    url = get_article_xml_url(article_title)
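Since get_article_xml_url is plain string concatenation, its behavior is easy to verify:

# get_article_xml_url simply appends the title to the Special:Export endpoint.
url = get_article_xml_url("Knowledge")
assert url == "http://en.wikipedia.org/wiki/Special:Export/Knowledge"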
Example #5
from scipy.linalg import norm
from scipy.sparse import csr_matrix as matrix
import scipy.sparse as sps
from math import log

from model.logger import getLogger
_log = getLogger(__name__)

#TODO: use scipy.spatial.distance for metrics  
def cosine_metrics(v1, v2):
    # short-circuit when both arguments are the same vector object;
    # element-wise == on sparse matrices is not a reliable truth test
    if v1 is v2: return 1.0

    # treat zero vectors
    denom = norm(v1.todense()) * norm(v2.todense())
    if denom == 0.0:
        _log.warning("One of the vectors is zero!")
        return 0.0

    similarity = v1.dot(v2.T) / denom
    return float(similarity[0, 0])

def get_vectors_centroid(list_of_vectors):
    """ Gets a list of scipy sparse vectors with the same dimensions and returns their centroid. """
    n = len(list_of_vectors)
    if n == 0: return
    # for a 1-d vector, shape holds its dimensions
    shape = list_of_vectors[0].shape
    ret_vec = matrix(shape)
    for vector in list_of_vectors:
        #_log.debug("get_vectors_centroid: Adding vector {}".format(vector))
Example #6
import unittest
import os

from wiki_knows import wiki_knowledge 
from model.stemmers import StopWordsStemmer
from parsers import parse_tools
import test.test_utils as test_utils  
from io_test_utils import getOutputFile 
from model import semantic_interpreter

from model import logger
_log = logger.getLogger(__name__)

class Test(unittest.TestCase):
    def setUp(self):
        self.tmp_dump_file = getOutputFile("wiki_knowledge_output.xml")
        self.tmp_wdb_file = getOutputFile("wiki_knowledge_output.wdb")
        self.tmp_parse_file = getOutputFile("wiki_knowledge_output.parsed.xml")
        
        self.expected_articles = ['Knowledge', 'Love', 'War'] 
        self.expected_xml_path = os.path.join(os.path.dirname(__file__) ,"expected_results/expected_xml_Knowledge_Love_War.xml")
        self.expected_wdb_path = os.path.join(os.path.dirname(__file__) ,"expected_results/expected_Knowledge_Love_War.wdb")
        
    def tearDown(self):
        pass
        
    def test__execution(self):
        """ This is not exactly a test, but a program execution..."""
        text1 = "i love to learn"
        text2 = "the world we know"
Example #7
import unittest
import os

from wiki_knows import wiki_knowledge
from model.stemmers import StopWordsStemmer
from parsers import parse_tools
import test.test_utils as test_utils
from io_test_utils import getOutputFile
from model import semantic_interpreter

from model import logger
_log = logger.getLogger(__name__)


class Test(unittest.TestCase):
    def setUp(self):
        self.tmp_dump_file = getOutputFile("wiki_knowledge_output.xml")
        self.tmp_wdb_file = getOutputFile("wiki_knowledge_output.wdb")
        self.tmp_parse_file = getOutputFile("wiki_knowledge_output.parsed.xml")

        self.expected_articles = ['Knowledge', 'Love', 'War']
        self.expected_xml_path = os.path.join(
            os.path.dirname(__file__),
            "expected_results/expected_xml_Knowledge_Love_War.xml")
        self.expected_wdb_path = os.path.join(
            os.path.dirname(__file__),
            "expected_results/expected_Knowledge_Love_War.wdb")

    def tearDown(self):
        pass
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import gzip
import urllib2 
from subprocess import Popen, PIPE

import model.logger as lg
_log = lg.getLogger(__name__)

def ensure_dir(f):
    d = os.path.dirname(f)
    if len(d)== 0:
        return
    if not os.path.exists(d):
        os.makedirs(d)

base_xml_url = "http://en.wikipedia.org/wiki/Special:Export/"

def get_article_xml_url(article_title):
    url = base_xml_url + article_title
    return url

def get_wiki_xmlpage_wget(article_title):
    url = get_article_xml_url(article_title)
    try:
        cmd = ["wget", '-qO-', '-S', url]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    except OSError:
        # retry on macOS
Example #9
#!/usr/bin/python
# -*- coding: utf-8 -*-

import bz2
from model.db_builder import DbBuilder
from model import stemmers
from model.wiki_doc import wiki_doc_to_xml
from model.semantic_interpreter import SemanticComparer

from parsers import web_tools
from parsers import parse_tools
import codecs

from model.logger import getLogger
_log = getLogger("WikiKnows")

import os
import pickle
from model.database_wrapper import DatabaseWrapper, DbContant


def ensure_dir(f):
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)


def make_dump(wiki_dump, articles_titles, compress=False):
    """ Download specified articles from Wikipedia site, 
        merges them into one file, compresses it as Wikipedia dump file
        @param articles_titles: article's canonic name on Wikipedia web page
Example #10
'''
Created on Sep 28, 2012

@author: inesmeya
'''

from concept import Concept
from model.database_wrapper import DatabaseWrapper
from model.build_utils import build_word_index, build_index_by_words, build_df, build_wieght_table, build_wieght_table_dok
from scipy.sparse import csr_matrix as matrix
from model.math_utils import normalize

from model.logger import getLogger

_log = getLogger('db_builder')


class DbBuilder(object):
    '''
    Builds a database (an inverted word index) from wiki documents.
    General usage:
        * add documents, one at a time
        * build the database
    When a document is added, it is converted to a Concept, which is just a
    bag of words representing the corresponding document.

    Each Concept has a unique ID, and duplicate Concept IDs are not allowed.
    For testability, if the id is None it is auto-generated.
    '''
    def __init__(self, stemmer):
        '''