Example #1
def __init__(self,
             DICT='models/abstract/abstracts.dict',
             TFIDF_MODEL='models/abstract/abstracts_tfidf.model',
             LSA_MODEL='models/abstract/abstracts_lsi.model'):
    try:
        from nltk.tokenize import RegexpTokenizer
        from nltk.stem import WordNetLemmatizer
        import nltk
        # Tokenizer keeps lowercase alphabetic runs; the lemmatizer and
        # stopword list need the NLTK WordNet/stopwords data downloaded.
        self._tokenizer = RegexpTokenizer(r'[a-z]+')
        self._lemma = WordNetLemmatizer()
        self._stopwords = set(nltk.corpus.stopwords.words('english'))
    except Exception:
        print('Install NLTK and download WordNet!')
        import sys
        sys.exit()
    try:
        from gensim import corpora, models
        from sematch.utility import FileIO
        # Load the pre-built dictionary, TF-IDF weights, and LSA projection.
        self._dict = corpora.Dictionary.load(FileIO.filename(DICT))
        self._tfidf = models.TfidfModel.load(FileIO.filename(TFIDF_MODEL))
        self._lsa = models.LsiModel.load(FileIO.filename(LSA_MODEL))
    except Exception:
        print('Install gensim and prepare models data!')
        import sys
        sys.exit()
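Once loaded, these three models are typically chained to project raw text into the LSA space. A minimal sketch of that step, assuming the attributes set up above (the method name vectorize is hypothetical, not part of the snippet):

def vectorize(self, text):
    # Hypothetical helper built on the attributes from __init__ above.
    tokens = [self._lemma.lemmatize(t) for t in self._tokenizer.tokenize(text.lower())]
    tokens = [t for t in tokens if t not in self._stopwords]
    bow = self._dict.doc2bow(tokens)    # token ids and counts
    return self._lsa[self._tfidf[bow]]  # TF-IDF weighting, then LSA projection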
Example #2
def test_embedding():
    from gensim.models import KeyedVectors
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import WordRelatedness
    # Two pre-trained models in word2vec binary format; either one can back
    # WordRelatedness. Only the Google News model is used below.
    model_wiki = KeyedVectors.load_word2vec_format(
        FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True)
    model_news = KeyedVectors.load_word2vec_format(
        FileIO.filename('models/googlenews.bin'), binary=True)
    rel = WordRelatedness(model_news)
    print(rel.word_similarity('happy', 'sad'))
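WordRelatedness itself is not shown here, but for a KeyedVectors-backed model a word similarity of this kind reduces to cosine similarity between the two word vectors. A standalone sketch (an approximation, not sematch's actual implementation):

def cosine_word_similarity(model, w1, w2):
    # KeyedVectors.similarity returns cosine similarity; fall back to 0.0
    # when either word is out of vocabulary.
    return model.similarity(w1, w2) if w1 in model and w2 in model else 0.0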
Example #3
def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    :param vec_file: the file storing the word vectors
    :param binary: whether the vectors are stored in binary format;
        the Google News vectors are binary, the Yelp vectors are not
    """
    self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file), binary=binary)
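Note that Word2Vec.load_word2vec_format only exists in old gensim releases; since gensim 1.0 the loader lives on KeyedVectors, so on a current install the equivalent line would be:

from gensim.models import KeyedVectors
self._wordvec = KeyedVectors.load_word2vec_format(FileIO.filename(vec_file), binary=binary)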
Example #4
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    # Doc2Vec model over category data ("cat2vec").
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy', 'sad'))
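ConceptRelatedness internals are not part of this snippet; with a Doc2Vec model, a relatedness score of this kind can be approximated by inferring a vector per token list and comparing the two. A rough standalone sketch (not sematch's actual implementation):

import numpy as np

def doc2vec_similarity(model, words1, words2):
    # Infer a vector for each list of tokens, then compare by cosine similarity.
    v1 = model.infer_vector(words1)
    v2 = model.infer_vector(words2)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))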
Example #5
def __init__(self, src='models/dbpedia_2015-04.owl'):
    # Requires: import rdflib; from rdflib.namespace import RDF, OWL;
    # from sematch.utility import FileIO
    # Parse the DBpedia ontology and index its classes and properties.
    self.graph = rdflib.Graph()
    self.graph.parse(FileIO.filename(src))
    self.root = 'http://www.w3.org/2002/07/owl#Thing'
    self.classes = [s for s in self.graph.subjects(RDF.type, OWL.Class)]
    self.o_properties = [s for s in self.graph.subjects(RDF.type, OWL.ObjectProperty)]
    self.d_properties = [s for s in self.graph.subjects(RDF.type, OWL.DatatypeProperty)]
    self.uri2class = {c.toPython(): c for c in self.classes}
    self.uri2class[self.root] = rdflib.URIRef(self.root)
    self.class_labels = [self.token(c) for c in self.classes]
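The token method used for class_labels is not defined in this snippet. A plausible reconstruction (hypothetical, not sematch's actual code) takes the local name of a class URI and splits its CamelCase into lowercase words:

import re

def token(self, uri):
    # Hypothetical: '...ontology/PopulatedPlace' -> ['populated', 'place']
    name = uri.toPython().split('#')[-1].split('/')[-1]
    return [w.lower() for w in re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])', name)]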
Example #6
def load_dataset(self, dataset_file, cat_full=False):
    # BeautifulSOAP is the SOAP-oriented parser class from the legacy
    # BeautifulSoup 3 package (Python 2 only); see the bs4 port below.
    from BeautifulSoup import BeautifulSOAP as bs
    pairs = []
    with open(FileIO.filename(dataset_file), 'r') as f:
        corpus = f.read()
        opinions = bs(corpus).findAll('opinion')
        for op in opinions:
            # Skip opinions without an explicit target.
            if not op['target'] == 'NULL':
                t = op['target']
                c = op['category'] if cat_full else op['category'].split('#')[0]
                pairs.append((t, c))
    X, y = zip(*pairs)
    return X, y
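On Python 3 the same parsing step can be written against bs4; this sketch assumes the same SemEval-style XML with <opinion target="..." category="..."> elements and reuses sematch's FileIO:

from bs4 import BeautifulSoup
from sematch.utility import FileIO

def load_dataset(dataset_file, cat_full=False):
    with open(FileIO.filename(dataset_file), 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    pairs = [(op['target'],
              op['category'] if cat_full else op['category'].split('#')[0])
             for op in soup.find_all('opinion') if op['target'] != 'NULL']
    X, y = zip(*pairs)
    return X, y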
Example #7
def load_stopwords(self, filename):
    # Requires: import itertools; from sematch.utility import FileIO
    data = FileIO.read_list_file(FileIO.filename(filename))
    # Skip the header line, then split each line in case it holds several words.
    data = [d.split() for d in data[1:]]
    data = list(itertools.chain.from_iterable(data))
    return data
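The flattening step is what lets a stopword file put several words on one line. A small demonstration of the same transformation on hypothetical file content:

import itertools

lines = ['# english stopwords', 'a an the', 'of']  # hypothetical file content
words = list(itertools.chain.from_iterable(d.split() for d in lines[1:]))
print(words)  # ['a', 'an', 'the', 'of']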