def __init__(self, DICT='models/abstract/abstracts.dict',
             TFIDF_MODEL='models/abstract/abstracts_tfidf.model',
             LSA_MODEL='models/abstract/abstracts_lsi.model'):
    """Initialize NLTK text-processing tools and pre-trained gensim models.

    :param DICT: path (relative to the sematch data dir) of the gensim dictionary
    :param TFIDF_MODEL: path of the trained TF-IDF model
    :param LSA_MODEL: path of the trained LSI (LSA) model

    Exits the process with a diagnostic message if NLTK (plus its corpora)
    or gensim and the model files are not available.
    """
    import sys  # hoisted: needed by both failure paths below
    try:
        from nltk.tokenize import RegexpTokenizer
        from nltk.stem import WordNetLemmatizer
        import nltk
        self._tokenizer = RegexpTokenizer(r'[a-z]+')
        self._lemma = WordNetLemmatizer()
        self._stopwords = set(nltk.corpus.stopwords.words('english'))
    except (ImportError, LookupError):
        # ImportError: NLTK not installed; LookupError: the wordnet/stopwords
        # corpora have not been downloaded. A bare `except:` here would also
        # swallow KeyboardInterrupt/SystemExit, hiding the real cause.
        print('Install NLTK and download WordNet!')
        sys.exit()
    try:
        from gensim import corpora, models
        from sematch.utility import FileIO
        self._dict = corpora.Dictionary.load(FileIO.filename(DICT))
        self._tfidf = models.TfidfModel.load(FileIO.filename(TFIDF_MODEL))
        self._lsa = models.LsiModel.load(FileIO.filename(LSA_MODEL))
    except Exception:
        # Covers a missing gensim install (ImportError) as well as absent or
        # corrupt model files (OSError and gensim load errors).
        print('Install gensim and prepare models data!')
        sys.exit()
def test_embedding():
    """Smoke-test WordRelatedness on the Google News word2vec vectors.

    Fix: the original also loaded 'models/w2v-model-enwiki_w2vformat' into a
    `model_wiki` variable that was never used — loading a word2vec binary is
    an expensive full-file read, so the dead load is dropped.
    """
    from gensim.models import KeyedVectors
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import WordRelatedness
    model_news = KeyedVectors.load_word2vec_format(
        FileIO.filename('models/googlenews.bin'), binary=True)
    rel = WordRelatedness(model_news)
    print(rel.word_similarity('happy', 'sad'))
def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    Load pre-trained word vectors from disk into self._wordvec.

    :param vec_file: the file storing vectors
    :param binary: if vector are stored in binary. Google news use binary while yelp not
    """
    # NOTE(review): Word2Vec.load_word2vec_format was deprecated in gensim 1.0
    # and removed in gensim 4.0; KeyedVectors.load_word2vec_format is the
    # replacement (and is what test_embedding elsewhere in this project uses).
    # Confirm the pinned gensim version before upgrading.
    self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file), binary=binary)
def test_category():
    """Smoke-test ConceptRelatedness on the trained cat2vec Doc2Vec model."""
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    category_model = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    relatedness = ConceptRelatedness(category_model)
    print(relatedness.word_similarity('happy', 'sad'))
def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    Load pre-trained word vectors from disk into self._wordvec.

    :param vec_file: the file storing vectors
    :param binary: if vector are stored in binary. Google news use binary while yelp not
    """
    # NOTE(review): Word2Vec.load_word2vec_format was deprecated in gensim 1.0
    # and removed in gensim 4.0; KeyedVectors.load_word2vec_format is the
    # replacement. Confirm the pinned gensim version before upgrading.
    self._wordvec = Word2Vec.load_word2vec_format(
        FileIO.filename(vec_file), binary=binary)
def __init__(self, src='models/dbpedia_2015-04.owl'):
    """Parse the DBpedia OWL ontology and index its classes and properties.

    :param src: path of the ontology file, resolved via FileIO.filename
    """
    self.graph = rdflib.Graph()
    self.graph.parse(FileIO.filename(src))
    self.root = 'http://www.w3.org/2002/07/owl#Thing'

    def subjects_of(rdf_type):
        # All subjects declared with rdf:type == rdf_type.
        return [subj for subj in self.graph.subjects(RDF.type, rdf_type)]

    self.classes = subjects_of(OWL.Class)
    self.o_properties = subjects_of(OWL.ObjectProperty)
    self.d_properties = subjects_of(OWL.DatatypeProperty)
    # Map each class URI string back to its rdflib node; owl:Thing is added
    # explicitly since it is not declared in the ontology file itself.
    self.uri2class = {cls.toPython(): cls for cls in self.classes}
    self.uri2class[self.root] = rdflib.URIRef(self.root)
    self.class_labels = [self.token(cls) for cls in self.classes]
def load_dataset(self, dataset_file, cat_full=False):
    """Parse an opinion-annotated XML dataset into (targets, categories).

    :param dataset_file: dataset path, resolved via FileIO.filename
    :param cat_full: keep the full 'ENTITY#ASPECT' category when True,
        otherwise only the part before '#'
    :return: tuple (X, y) of parallel target / category tuples
    """
    # BeautifulSOAP is the (BeautifulSoup 3) parser the project relies on.
    from BeautifulSoup import BeautifulSOAP as bs
    with open(FileIO.filename(dataset_file), 'r') as f:
        corpus = f.read()
    pairs = []
    for op in bs(corpus).findAll('opinion'):
        # Opinions without a concrete target are marked 'NULL' — skip them.
        if op['target'] == 'NULL':
            continue
        category = op['category'] if cat_full else op['category'].split('#')[0]
        pairs.append((op['target'], category))
    X, y = zip(*pairs)
    return X, y
def __init__(self, src='models/dbpedia_2015-04.owl'):
    """Load the DBpedia OWL ontology and build lookup structures.

    :param src: path of the ontology file, resolved via FileIO.filename
    """
    self.graph = rdflib.Graph()
    self.graph.parse(FileIO.filename(src))
    self.root = 'http://www.w3.org/2002/07/owl#Thing'
    query = self.graph.subjects  # hoist the bound method for the three scans
    self.classes = list(query(RDF.type, OWL.Class))
    self.o_properties = list(query(RDF.type, OWL.ObjectProperty))
    self.d_properties = list(query(RDF.type, OWL.DatatypeProperty))
    # URI-string -> rdflib node; owl:Thing is not declared in the file,
    # so its entry is created by hand.
    self.uri2class = {node.toPython(): node for node in self.classes}
    self.uri2class[self.root] = rdflib.URIRef(self.root)
    self.class_labels = [self.token(node) for node in self.classes]
def load_dataset(self, dataset_file, cat_full=False):
    """Extract (target, category) pairs from an opinion-annotated XML file.

    :param dataset_file: dataset path, resolved via FileIO.filename
    :param cat_full: when True keep the full 'ENTITY#ASPECT' category;
        otherwise truncate at the first '#'
    :return: tuple (X, y) of parallel target / category tuples
    """
    # BeautifulSOAP is the (BeautifulSoup 3) parser the project relies on.
    from BeautifulSoup import BeautifulSOAP as bs
    with open(FileIO.filename(dataset_file), 'r') as handle:
        markup = handle.read()
    collected = []
    for opinion in bs(markup).findAll('opinion'):
        target = opinion['target']
        if target == 'NULL':
            # 'NULL' marks opinions without an explicit target — ignore.
            continue
        if cat_full:
            category = opinion['category']
        else:
            category = opinion['category'].split('#')[0]
        collected.append((target, category))
    X, y = zip(*collected)
    return X, y
def load_stopwords(self, filename):
    """Read a stopword list file and return its words as a flat list.

    The first line of the file is a header and is skipped; every other
    line may contain one or more whitespace-separated words.
    """
    raw_lines = FileIO.read_list_file(FileIO.filename(filename))
    words = []
    for line in raw_lines[1:]:  # skip header line
        words.extend(line.split())
    return words
def load_stopwords(self, filename):
    """Return the stopwords from *filename* as a flat list.

    The first line is treated as a header and skipped; remaining lines
    may each carry several whitespace-separated words.
    """
    entries = FileIO.read_list_file(FileIO.filename(filename))
    # Flatten all post-header lines into a single word list.
    return [word for line in entries[1:] for word in line.split()]