def _similarity(self, sentences):
    """N-gram similarity search.

    The NGram class extends the Python `set` class with efficient fuzzy
    search for members by means of an N-gram similarity measure.

    Reference:
        Václav Chvátal and David Sankoff. Longest common subsequences of
        two random sequences. Journal of Applied Probability, 1975.
    Python module: ngram (https://pypi.org/project/ngram/)
    """
    ngram = NGram(self.corpus.split(), key=lambda x: x.lower(), N=self.N)
    predicts = []

    if not isinstance(sentences, list):
        sentences = [sentences]

    for i in range(len(sentences)):
        split = []
        for x in sentences[i].split():
            # skip punctuation; otherwise look up the closest vocabulary entry
            sugg = ngram.find(x.lower()) if x not in string.punctuation else None
            split.append(sugg if sugg else x)
        predicts.append(" ".join(split))

    return predicts
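As a minimal illustration of what the per-token lookup does, here is a standalone sketch with a made-up vocabulary standing in for self.corpus; only the ngram package is assumed:

from ngram import NGram

# hypothetical vocabulary standing in for self.corpus.split()
vocabulary = ["apple", "banana", "orange"]
index = NGram(vocabulary, key=lambda x: x.lower(), N=3)

# find() returns the closest member, or None when the query shares no n-grams,
# in which case _similarity keeps the original token
print(index.find("aple"))    # -> 'apple'
print(index.find("xyzzy"))   # -> None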
import json
import math
import multiprocessing as mp
import os
import pickle

import gridfs
import pymongo
from ngram import NGram

# graceful_auto_reconnect is assumed to be a project-level decorator that
# retries Mongo operations after transient connection failures.


class PMI(object):
    """docstring for PMI"""

    def __init__(self, lang, uri=None, ngram=False):
        self.client = pymongo.MongoClient(uri)
        self.uri = uri
        self.lang = lang
        self.db = self.client['nlp_{}'.format(self.lang)]
        self.fs = gridfs.GridFS(self.db)
        self.Collect = self.db['pmi']
        self.cpus = math.ceil(mp.cpu_count() * 0.2)
        self.frequency = {}
        if ngram:
            # use ngram for fuzzy keyword search
            self.pmiNgram = NGram((i['key'] for i in self.db['pmi'].find(
                {}, {'key': 1, '_id': False})))

    def getWordFreqItems(self):
        # Use the cached frequencies if they exist.
        if os.path.exists('frequency.pkl'):
            self.frequency = pickle.load(open('frequency.pkl', 'rb'))
            frequency_of_total_keyword = pickle.load(
                open('frequency_of_total_keyword.pkl', 'rb'))
            return frequency_of_total_keyword

        # Build self.frequency, a dict of every keyword's frequency,
        # and return the total keyword count.
        self.frequency = {}
        frequency_of_total_keyword = 0

        # iterate through GridFS
        for keyword in self.fs.list():
            cursor = self.fs.find({"filename": keyword})[0]
            value = {
                'PartOfSpeech': cursor.contentType,
                'value': json.loads(self.fs.get(cursor._id).read().decode('utf-8'))
            }
            for correlation_keyword, PartOfSpeech, corTermCount in value['value']:
                frequency_of_total_keyword += corTermCount
                # accumulate the keyword's frequency
                self.frequency[keyword] = self.frequency.setdefault(keyword, 0) + corTermCount

        # iterate through the normal collection
        for i in self.db['kcm'].find({}):
            keyword = i['key']
            for correlation_keyword, PartOfSpeech, corTermCount in i['value']:
                frequency_of_total_keyword += corTermCount
                # accumulate the keyword's frequency
                self.frequency[keyword] = self.frequency.setdefault(keyword, 0) + corTermCount

        pickle.dump(self.frequency, open('frequency.pkl', 'wb'))
        pickle.dump(frequency_of_total_keyword,
                    open('frequency_of_total_keyword.pkl', 'wb'))
        return frequency_of_total_keyword

    def build(self):
        self.Collect.remove({})
        # Read all frequencies from KCM and build the PMI of every KCM keyword in MongoDB,
        # with format {key: '中興大學', freq: 100, value: [(keyword, PMI-value), (keyword, PMI-value), ...]}.
        frequency_of_total_keyword = self.getWordFreqItems()
        print('frequency of total keyword:' + str(frequency_of_total_keyword))

        @graceful_auto_reconnect
        def process_job(job_list):
            # Each process needs an independent MongoClient,
            # otherwise it may deadlock in Mongo.
            client = pymongo.MongoClient(self.uri)
            db = client['nlp_{}'.format(self.lang)]
            process_collect = db['pmi']
            kcm_collect = db['kcm']
            fs = gridfs.GridFS(db)

            result = []
            for keyword, keyword_freq in job_list:
                pmiResult = []

                collection_cursor = kcm_collect.find(
                    {'key': keyword}, {'value': 1, '_id': False}).limit(1)
                if collection_cursor.count() == 0:
                    gridfs_cursor = fs.find({"filename": keyword}).limit(1)[0]
                    cursor_result = json.loads(
                        fs.get(gridfs_cursor._id).read().decode('utf-8'))[:500]
                else:
                    cursor_result = collection_cursor[0]['value']

                for kcmKeyword, PartOfSpeech, kcmCount in cursor_result:
                    # Algorithm:
                    #   PMI = log2(p(x, y) / (p(x) * p(y)))
                    #   p(x, y) = frequency of (x, y) / frequency of total keyword
                    #   p(x)    = frequency of x / frequency of total keyword
                    value = math.log2(
                        kcmCount * frequency_of_total_keyword /
                        (keyword_freq * self.frequency[kcmKeyword]))
                    # This weighting was contributed by 陳聖軒.
                    # Contact him on Facebook: https://www.facebook.com/henrymayday
                    value *= math.log2(self.frequency[kcmKeyword])
                    pmiResult.append((kcmKeyword, value))

                pmiResult = sorted(pmiResult, key=lambda x: -x[1])
                result.append({
                    'key': keyword,
                    'freq': keyword_freq,
                    'value': pmiResult
                })

                # Insert documents into MongoDB in batches.
                if len(result) > 5000:
                    process_collect.insert(result)
                    result = []

            # Flush any remaining documents that did not fill a full batch.
            if result:
                process_collect.insert(result)

        amount = math.ceil(len(self.frequency) / self.cpus)
        job_list = list(self.frequency.items())
        job_list = [
            job_list[i:i + amount]
            for i in range(0, len(self.frequency), amount)
        ]
        # one process per chunk (at most self.cpus chunks)
        processes = [
            mp.Process(target=process_job, kwargs={'job_list': chunk})
            for chunk in job_list
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()

        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        cursor = self.Collect.find({'key': keyword},
                                   {'value': 1, '_id': False}).limit(1)
        if cursor.count() != 0:
            return {
                'key': keyword,
                'value': cursor[0]['value'][:amount],
                'similarity': 1
            }
        else:
            pmiNgramKeyword = self.pmiNgram.find(keyword)
            if pmiNgramKeyword:
                result = self.Collect.find(
                    {'key': pmiNgramKeyword},
                    {'value': 1, '_id': False}).limit(1)[0]['value'][:amount]
                return {
                    'key': pmiNgramKeyword,
                    'value': result,
                    'similarity': self.pmiNgram.compare(pmiNgramKeyword, keyword)
                }
        return {}
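The expression inside process_job is pointwise mutual information with an extra log-frequency weighting. A self-contained numeric sketch of the same arithmetic, using made-up counts (30 co-occurrences, 10,000 total, marginal frequencies 200 and 150):

import math

kcm_count = 30               # co-occurrence count of (x, y)
total = 10_000               # frequency_of_total_keyword
freq_x, freq_y = 200, 150    # marginal keyword frequencies

pmi = math.log2(kcm_count * total / (freq_x * freq_y))   # log2(p(x,y) / (p(x) p(y))) ≈ 3.32
weighted = pmi * math.log2(freq_y)                        # extra log2(freq(y)) factor ≈ 24.0
print(pmi, weighted)

In typical use the class appears to be driven as PMI(lang, uri=...).build() once the kcm collection has been populated, followed by get(keyword, amount) for lookups.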
import csv

from ngram import NGram

records = NGram()
with open('./data/houses.csv', 'r', encoding='windows-1251') as f:
    for line in csv.reader(f, delimiter=';'):
        records.add(' '.join(line).lower())

while True:
    print('Enter search text:')
    search_text = input().lower()
    # find() takes an optional minimum-similarity threshold as its second argument
    print('find', records.find(search_text, 0.8))
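A self-contained variant of the same lookup, with made-up record strings, showing how the second argument to find() filters weak matches:

from ngram import NGram

records = NGram(['12 green street', '7 oak avenue', '3 river road'])

print(records.find('green stret', 0.3))    # -> '12 green street'
print(records.find('nothing alike', 0.3))  # -> None once the threshold cuts it off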
import logging
import os
import queue
from threading import Thread

import pymongo
from pymongo import MongoClient
from ngram import NGram

# BASE_DIR, rm_tags, paragraphs_to_sentences_cht, PosTokenizer,
# terms_to_term_pair_freq, timing and removeInputFile are assumed to be
# provided by the project's own helper modules.


class KCM(object):
    """docstring for KCM

    Args:
        lang: language, english or chinese (eng/cht), required
        io_dir: input/output directory, required
        max_file_count: maximum number of input files, 0 for no limit (int, default 0)
        thread_count: number of threads used (int, default 1)
    """

    def __init__(self, lang, io_dir, max_file_count=0, thread_count=1, uri=None):
        self.BASE_DIR = BASE_DIR
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)
        self.max_file_count = max_file_count
        self.thread_count = thread_count
        self.client = MongoClient(uri)
        self.db = self.client['nlp']
        self.Collect = self.db['kcm']

        # ngram search over existing keys
        self.kcmNgram = NGram((i['key'] for i in self.Collect.find(
            {}, {'key': 1, '_id': False})))

        logging.basicConfig(format='%(levelname)s : %(asctime)s : %(message)s',
                            filename='KCM_{}.log'.format(self.lang),
                            level=logging.INFO)
        logging.info('Begin gen_kcm.py')
        logging.info('input/output directory {self.io_dir}, '
                     'maximum file count {self.max_file_count}, '
                     'use {self.thread_count} threads'.format(**locals()))

    def get_source_file_list(self):
        """Generate list of term data source files.

        Args:
            args: input arguments, use self.lang, self.max_file_count
        Returns:
            list of source files
        """
        file_list = []
        # wiki files
        for (dir_path, dir_names, file_names) in os.walk(self.io_dir):
            for file_name in file_names:
                if self.max_file_count and len(file_list) >= self.max_file_count:
                    break
                if file_name == '.DS_Store' or '.model' in file_name:
                    # skip OS X metadata and previously built model files
                    continue
                file_list.append(os.path.join(dir_path, file_name))
                logging.info('appended file {}'.format(
                    os.path.join(dir_path, file_name)))

        if not file_list:
            logging.info('no file selected, end of script')
            exit()
        return file_list

    @timing
    def remove_symbols_tags(self, if_name):
        """Remove symbols and tags. Read input file, output to output file.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        return rm_tags(if_name)

    @timing
    def paragraphs_to_sentences(self, inputData):
        """Generate sentences from paragraphs.

        Args:
            inputData: input data from the previous step
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        return paragraphs_to_sentences_cht(inputData)

    @timing
    def sentences_to_terms(self, if_name, inputData):
        """Generate terms from sentences.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        prefix = if_name.split('/')[-1].split('_')[0]
        of_name = '{self.io_dir}/{prefix}_terms_{self.lang}'.format(**locals())
        PosTokenizer(self.BASE_DIR, inputData, of_name, 'r')
        return of_name

    @removeInputFile
    @timing
    def terms_to_term_pairs(self, if_name):
        """Generate term pairs from terms.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        of_name = '{self.io_dir}/{self.lang}.model'.format(**locals())
        script_file = 'build/terms_to_term_pair_freq.py'
        terms_to_term_pair_freq(if_name, of_name, min_freq=1, max_term_len=20)
        return of_name

    @timing
    def join_terms_files(self, if_names):
        """Join terms files into one.

        Args:
            if_names: input terms files names
            args: input arguments
        """
        of_name = '{self.io_dir}/terms_{self.lang}'.format(**locals())
        with open(of_name, 'w') as output_file:
            for if_name in if_names:
                with open(if_name, 'r') as input_file:
                    for line in input_file:
                        output_file.write(line)
        return of_name

    def gen_terms_file(self, if_name, o_list):
        """Generate terms file.

        Args:
            if_name: input wiki source file name
            args: input arguments
            o_list: output list saving generated file name
        """
        result = self.remove_symbols_tags(if_name)
        result = self.paragraphs_to_sentences(result)
        of_name = self.sentences_to_terms(if_name, result)
        o_list.append(of_name)

    def thread_job(self, input_file_queue, o_term_files):
        """Job to be done by a thread (generate terms file).

        Args:
            input_file_queue: queue containing input files that need processing
            args: input arguments
            o_term_files: list for outputting generated term file names
        """
        while True:
            if_name = input_file_queue.get()
            if if_name is None:
                break  # end of thread
            self.gen_terms_file(if_name, o_list=o_term_files)
            input_file_queue.task_done()

    @timing
    def main(self):
        """main function"""
        if_list = self.get_source_file_list()
        term_files = []
        input_file_queue = queue.Queue()
        threads = []

        for i in range(self.thread_count):
            t = Thread(target=self.thread_job,
                       args=(input_file_queue, term_files))
            t.start()
            threads.append(t)

        for if_name in if_list:
            input_file_queue.put(if_name)

        # block till all tasks are done (i.e. all input files are processed)
        input_file_queue.join()

        # stop all threads: in thread_job, a None item ends the thread
        for i in range(self.thread_count):
            input_file_queue.put(None)
        for t in threads:
            t.join()  # wait till all threads really end

        of_name = self.join_terms_files(term_files)
        self.terms_to_term_pairs(of_name)
        self.import2DB()

    def setLang(self, lang):
        self.lang = lang
        # rebuild the language-specific io path from the parent directory
        self.io_dir = os.path.join(os.path.dirname(self.io_dir), self.lang)

    def removeDB(self):
        self.Collect.remove({})

    def import2DB(self):
        result = dict()
        with open(os.path.join(self.io_dir, "{0}.model".format(self.lang)),
                  'r', encoding='utf8') as f:
            for i in f:
                tmp = i.split()
                result.setdefault(tmp[0], []).append([tmp[1], int(tmp[2])])
                result.setdefault(tmp[1], []).append([tmp[0], int(tmp[2])])

        documentArr = [{
            'key': pair[0],
            'value': sorted(pair[1], key=lambda x: -x[1])
        } for pair in result.items()]
        del result
        self.Collect.insert(documentArr)
        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        result = self.Collect.find({'key': keyword}, {'_id': False}).limit(1)
        if result.count():
            return {**(result[0]), 'similarity': 1}
        else:
            ngramKeyword = self.kcmNgram.find(keyword)
            if ngramKeyword:
                result = self.Collect.find({'key': ngramKeyword},
                                           {'_id': False}).limit(1)
                return {
                    'key': ngramKeyword,
                    'value': result[0]['value'][:amount],
                    'similarity': self.kcmNgram.compare(keyword, ngramKeyword)
                }
            return {'key': ngramKeyword, 'value': [], 'similarity': 0}
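A hedged end-to-end sketch of how the class appears to be driven; the paths, URI, and keyword below are placeholders, and a running MongoDB plus the project's helper modules are assumed:

# hypothetical paths, URI and keyword
kcm = KCM(lang='cht', io_dir='./data', thread_count=4,
          uri='mongodb://localhost:27017')
kcm.main()                      # tag files, count term pairs, import into Mongo
print(kcm.get('中興大學', 10))   # exact hit, or the closest n-gram key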
def get_similar(data, target):
    # Build an NGram index over the target collection and fuzzy-match `data` against it.
    G = NGram(target)
    return G.find(data)
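For example, with arbitrary strings:

candidates = ['machine learning', 'machine translation', 'data mining']
print(get_similar('machin learnin', candidates))  # -> 'machine learning'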