Example #1
    def _similarity(self, sentences):
        """
        Spelling correction by N-gram similarity search over the corpus vocabulary.

        The NGram class extends the Python ‘set’ class with efficient fuzzy search for members by
        means of an N-gram similarity measure.

        Reference:
            Václav Chvátal and David Sankoff.
            Longest common subsequences of two random sequences.
            Journal of Applied Probability, 1975.

            Python module: ngram (https://pypi.org/project/ngram/)
        """

        ngram = NGram(self.corpus.split(), key=lambda x: x.lower(), N=self.N)
        predicts = []

        if not isinstance(sentences, list):
            sentences = [sentences]

        for sentence in sentences:
            split = []

            for x in sentence.split():
                # look up the closest corpus word; punctuation is left unchanged
                sugg = ngram.find(
                    x.lower()) if x not in string.punctuation else None
                split.append(sugg if sugg else x)

            predicts.append(" ".join(split))

        return predicts
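
A minimal standalone sketch of the NGram lookup this method relies on (the corpus string and the misspelled tokens are made-up illustrations, not part of the original project):

from ngram import NGram

corpus = "the quick brown fox jumps over the lazy dog"
index = NGram(corpus.split(), key=lambda x: x.lower(), N=2)

print(index.find("qiuck"))  # expected: 'quick', the most similar vocabulary member
print(index.find("zzz"))    # shares no N-grams with any member, so find() returns None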
Example #2
class PMI(object):
    """docstring for PMI"""
    def __init__(self, lang, uri=None, ngram=False):
        self.client = pymongo.MongoClient(uri)
        self.uri = uri
        self.lang = lang
        self.db = self.client['nlp_{}'.format(self.lang)]
        self.fs = gridfs.GridFS(self.db)

        self.Collect = self.db['pmi']
        self.cpus = math.ceil(mp.cpu_count() * 0.2)
        self.frequency = {}

        if ngram:
            # use ngram for searching
            self.pmiNgram = NGram((i['key']
                                   for i in self.db['pmi'].find({}, {
                                       'key': 1,
                                       '_id': False
                                   })))

    def getWordFreqItems(self):
        # use cache
        if os.path.exists('frequency.pkl'):
            self.frequency = pickle.load(open('frequency.pkl', 'rb'))
            frequency_of_total_keyword = pickle.load(
                open('frequency_of_total_keyword.pkl', 'rb'))
            return frequency_of_total_keyword

        # build self.frequency (keyword -> frequency) and count the total keyword frequency
        self.frequency = {}
        frequency_of_total_keyword = 0

        # iterate through gridFS
        for keyword in self.fs.list():
            cursor = self.fs.find({"filename": keyword})[0]
            value = {
                'PartOfSpeech': cursor.contentType,
                'value':
                json.loads(self.fs.get(cursor._id).read().decode('utf-8'))
            }
            for correlation_keyword, PartOfSpeech, corTermCount in value[
                    'value']:
                frequency_of_total_keyword += corTermCount
                # accumulate keyword's frequency.
                self.frequency[keyword] = self.frequency.setdefault(
                    keyword, 0) + corTermCount

        # iterate through all normal collection
        for i in self.db['kcm'].find({}):
            keyword = i['key']
            for correlation_keyword, PartOfSpeech, corTermCount in i['value']:
                frequency_of_total_keyword += corTermCount
                # accumulate keyword's frequency.
                self.frequency[keyword] = self.frequency.setdefault(
                    keyword, 0) + corTermCount

        pickle.dump(self.frequency, open('frequency.pkl', 'wb'))
        pickle.dump(frequency_of_total_keyword,
                    open('frequency_of_total_keyword.pkl', 'wb'))
        return frequency_of_total_keyword

    def build(self):
        self.Collect.remove({})
        # read all frequency from KCM and build all PMI of KCM in MongoDB.
        # with format {key:'中興大學', freq:100, value:[(keyword, PMI-value), (keyword, PMI-value)...]}
        frequency_of_total_keyword = self.getWordFreqItems()
        print('frequency of total keyword:' + str(frequency_of_total_keyword))

        @graceful_auto_reconnect
        def process_job(job_list):
            # Each process needs its own MongoDB client,
            # otherwise Mongo may deadlock.
            client = pymongo.MongoClient(self.uri)
            db = client['nlp_{}'.format(self.lang)]
            process_collect = db['pmi']
            kcm_collect = db['kcm']
            fs = gridfs.GridFS(db)

            result = []
            for keyword, keyword_freq in job_list:
                pmiResult = []

                collection_cursor = kcm_collect.find({
                    'key': keyword
                }, {
                    'value': 1,
                    '_id': False
                }).limit(1)
                if collection_cursor.count() == 0:
                    gridfs_cursor = fs.find({"filename": keyword}).limit(1)[0]
                    cursor_result = json.loads(
                        fs.get(gridfs_cursor._id).read().decode('utf-8'))[:500]
                else:
                    cursor_result = collection_cursor[0]['value']
                for kcmKeyword, PartOfSpeech, kcmCount in cursor_result:
                    # algorithm:
                    # PMI = log2(p(x, y)/p(x)*p(y))
                    # p(x, y) = frequency of (x, y) / frequency of total keyword.
                    # p(x) = frequency of x / frequency of total keyword.
                    value = math.log2(
                        kcmCount * frequency_of_total_keyword /
                        (keyword_freq * self.frequency[kcmKeyword]))

                    # This weighting term was contributed by 陳聖軒.
                    # Contact him on Facebook: https://www.facebook.com/henrymayday
                    value *= math.log2(self.frequency[kcmKeyword])

                    pmiResult.append((kcmKeyword, value))

                pmiResult = sorted(pmiResult, key=lambda x: -x[1])
                result.append({
                    'key': keyword,
                    'freq': keyword_freq,
                    'value': pmiResult
                })

                # insert accumulated documents into MongoDB in batches
                if len(result) > 5000:
                    process_collect.insert(result)
                    result = []

            # flush the remaining documents of this job chunk
            if result:
                process_collect.insert(result)

        amount = math.ceil(len(self.frequency) / self.cpus)
        job_list = list(self.frequency.items())
        job_list = [
            job_list[i:i + amount]
            for i in range(0, len(self.frequency), amount)
        ]
        processes = [
            mp.Process(target=process_job, kwargs={'job_list': job_list[i]})
            for i in range(len(job_list))
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        cursor = self.Collect.find({
            'key': keyword
        }, {
            'value': 1,
            '_id': False
        }).limit(1)
        if cursor.count() != 0:
            return {
                'key': keyword,
                'value': cursor[0]['value'][:amount],
                'similarity': 1
            }
        else:
            pmiNgramKeyword = self.pmiNgram.find(keyword)
            if pmiNgramKeyword:
                result = self.Collect.find({
                    'key': pmiNgramKeyword
                }, {
                    'value': 1,
                    '_id': False
                }).limit(1)[0]['value'][:amount]
                return {
                    'key': pmiNgramKeyword,
                    'value': result,
                    'similarity':
                    self.pmiNgram.compare(pmiNgramKeyword, keyword)
                }
        return {}
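
For reference, the score computed inside process_job() above reduces to the following standalone function (a sketch; the helper name and the counts passed at the end are illustrative, not from the original code):

import math

def weighted_pmi(cooccur_count, total_count, freq_x, freq_y):
    # PMI(x, y) = log2( p(x, y) / (p(x) * p(y)) )
    #           = log2( cooccur_count * total_count / (freq_x * freq_y) )
    pmi = math.log2(cooccur_count * total_count / (freq_x * freq_y))
    # build() additionally weights the score by log2 of the co-keyword's frequency
    return pmi * math.log2(freq_y)

print(weighted_pmi(cooccur_count=50, total_count=1000000, freq_x=400, freq_y=300))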
Example #3
import csv

from ngram import NGram

records = NGram()

with open('./data/houses.csv', 'r', encoding='windows-1251') as f:
    for line in csv.reader(f, delimiter=';'):
        records.add(' '.join(line).lower())

while True:
    print('Enter search text:')
    search_text = input().lower()
    print('find', records.find(search_text, 0.8))
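
If ranked candidates with scores are wanted instead of only the best hit, the same ngram package also exposes search(); a small self-contained sketch (the records and the threshold below are illustrative):

from ngram import NGram

records = NGram(['main st 10', 'main st 12', 'park ave 5'])
for item, similarity in records.search('main st 10a', threshold=0.3):
    print(similarity, item)  # (item, similarity) pairs, best match first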

Example #4
class KCM(object):
    """docstring for KCM"""
    '''args
    lang              language, english or chinese (eng/cht), required
    io_dir            input/output directory, required
    max_file_count    maximum number of input files, 0 for no limit (int, default=0)
    thread_count      number of threads used (int, default=1)
    '''
    def __init__(self,
                 lang,
                 io_dir,
                 max_file_count=0,
                 thread_count=1,
                 uri=None):
        self.BASE_DIR = BASE_DIR
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)
        self.max_file_count = max_file_count
        self.thread_count = thread_count

        self.client = MongoClient(uri)
        self.db = self.client['nlp']
        self.Collect = self.db['kcm']

        # ngram search
        self.kcmNgram = NGram((i['key'] for i in self.Collect.find({}, {
            'key': 1,
            '_id': False
        })))
        logging.basicConfig(format='%(levelname)s : %(asctime)s : %(message)s',
                            filename='KCM_{}.log'.format(self.lang),
                            level=logging.INFO)
        logging.info('Begin gen_kcm.py')
        logging.info('input directory {self.io_dir}, '
                     'maximum file count {self.max_file_count}, '
                     'use {self.thread_count} threads'.format(**locals()))

    def get_source_file_list(self):
        """Generate list of term data source files

        Args:
            args: input arguments, use self.lang, self.max_file_count

        Returns:
            list of source files
        """

        file_list = []  # wiki files
        for (dir_path, dir_names, file_names) in os.walk(self.io_dir):
            for file_name in file_names:
                if self.max_file_count and len(
                        file_list) >= self.max_file_count:
                    break
                if file_name == '.DS_Store' or '.model' in file_name:  # for OS X
                    continue
                file_list.append(os.path.join(dir_path, file_name))
                logging.info('appended file {}'.format(
                    os.path.join(dir_path, file_name)))

        if not file_list:
            logging.info('no file selected, end of script')
            exit()

        return file_list

    @timing
    def remove_symbols_tags(self, if_name):
        """Remove symbols and tags. Read input file, output to output file.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        return rm_tags(if_name)

    @timing
    def paragraphs_to_sentences(self, inputData):
        """Generate sentences from paragraphs. Read input file, output to output file

        Args:
            inputData: input data from former process.
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        return paragraphs_to_sentences_cht(inputData)

    @timing
    def sentences_to_terms(self, if_name, inputData):
        """generate terms from sentences

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        prefix = if_name.split('/')[-1].split('_')[0]
        of_name = '{self.io_dir}/{prefix}_terms_{self.lang}'.format(**locals())
        PosTokenizer(self.BASE_DIR, inputData, of_name, 'r')

        return of_name

    @removeInputFile
    @timing
    def terms_to_term_pairs(self, if_name):
        """Generate term pairs from terms.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        of_name = '{self.io_dir}/{self.lang}.model'.format(**locals())

        terms_to_term_pair_freq(if_name, of_name, min_freq=1, max_term_len=20)

        return of_name

    @timing
    def join_terms_files(self, if_names):
        """Join terms files into one

        Args:
            if_names: input terms files names
            args: input arguments
        """
        of_name = '{self.io_dir}/terms_{self.lang}'.format(**locals())
        with open(of_name, 'w') as output_file:
            for if_name in if_names:
                with open(if_name, 'r') as input_file:
                    for line in input_file:
                        output_file.write(line)

        return of_name

    def gen_terms_file(self, if_name, o_list):
        """Generate terms file

        Args:
            if_name: input wiki source file name
            args: input arguments
            o_list: output list saving generated file name
        """
        result = self.remove_symbols_tags(if_name)
        result = self.paragraphs_to_sentences(result)
        of_name = self.sentences_to_terms(if_name, result)
        o_list.append(of_name)

    def thread_job(self, input_file_queue, o_term_files):
        """Job to be done by thread (generate terms file)

        Args:
            input_file_queue: queue containing input files that needs processing
            args: input arguments
            o_term_files: list for outputting generated term file names
        """
        while True:
            if_name = input_file_queue.get()
            if if_name is None:
                break  # end of thread
            self.gen_terms_file(if_name, o_list=o_term_files)
            input_file_queue.task_done()

    @timing
    def main(self):
        """main function"""
        if_list = self.get_source_file_list()

        term_files = []
        input_file_queue = queue.Queue()
        threads = []
        for i in range(self.thread_count):
            t = Thread(target=self.thread_job,
                       args=(input_file_queue, term_files))
            t.start()
            threads.append(t)

        for if_name in if_list:
            input_file_queue.put(if_name)

        # block until all tasks are done (i.e. all input files processed)
        input_file_queue.join()

        # stop all threads
        for i in range(self.thread_count):
            input_file_queue.put(None)
            # in thread_job, when input_file_queue.get() returns None, the thread ends
        for t in threads:
            t.join()  # wait till all threads really end

        of_name = self.join_terms_files(term_files)
        self.terms_to_term_pairs(of_name)
        self.import2DB()

    def setLang(self, lang):
        self.lang = lang
        # rebuild io_dir under the new language
        self.io_dir = os.path.join(os.path.dirname(self.io_dir), self.lang)

    def removeDB(self):
        self.Collect.remove({})

    def import2DB(self):
        result = dict()
        with open(os.path.join(self.io_dir, "{0}.model".format(self.lang)),
                  'r',
                  encoding='utf8') as f:
            for i in f:
                tmp = i.split()
                result.setdefault(tmp[0], []).append([tmp[1], int(tmp[2])])
                result.setdefault(tmp[1], []).append([tmp[0], int(tmp[2])])

        documentArr = [{
            'key': pair[0],
            'value': sorted(pair[1], key=lambda x: -x[1])
        } for pair in result.items()]
        del result

        self.Collect.insert(documentArr)
        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        result = self.Collect.find({'key': keyword}, {'_id': False}).limit(1)
        if result.count():
            return {**(result[0]), 'similarity': 1}
        else:
            ngramKeyword = self.kcmNgram.find(keyword)
            if ngramKeyword:
                result = self.Collect.find({
                    'key': ngramKeyword
                }, {
                    '_id': False
                }).limit(1)
                return {
                    'key': ngramKeyword,
                    'value': result[0]['value'][:amount],
                    'similarity': self.kcmNgram.compare(keyword, ngramKeyword)
                }
            return {'key': ngramKeyword, 'value': [], 'similarity': 0}
def get_similar(data, target):
    """Return the member of `target` most similar to `data`, found by N-gram search."""
    G = NGram(target)
    return G.find(data)
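
The main()/thread_job() pair in KCM above follows the common queue-plus-sentinel worker pattern. A minimal self-contained sketch of that pattern (the worker body and all names are illustrative, not taken from KCM):

import queue
from threading import Thread

def worker(task_queue, results):
    while True:
        item = task_queue.get()
        if item is None:       # sentinel: no more work, stop this thread
            break
        results.append(item.upper())  # placeholder for real per-file processing
        task_queue.task_done()

task_queue = queue.Queue()
results = []
threads = [Thread(target=worker, args=(task_queue, results)) for _ in range(4)]
for t in threads:
    t.start()

for item in ['a', 'b', 'c']:
    task_queue.put(item)

task_queue.join()              # block until every queued item has been processed
for _ in threads:
    task_queue.put(None)       # one sentinel per worker thread
for t in threads:
    t.join()

print(results)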