Example #1
0
def calc_common_word_ngram(text_sentence, question):
    """Count how many of the question's character n-grams also occur in the sentence.

    Both inputs are stripped of punctuation and lower-cased (via the
    project's Turkish-aware ``tr_lower``) before being split into
    ``N_GRAM``-sized grams.  Each occurrence of a question gram counts
    separately, matching the original loop's behavior.

    Parameters
    ----------
    text_sentence : str
        Candidate sentence from the source text.
    question : str
        Question text to compare against.

    Returns
    -------
    int
        Number of question n-grams that also appear in the sentence.
    """
    # Strip punctuation (project helper defined elsewhere in this file/module).
    text_sentence = remove_punctuation(text_sentence)
    question = remove_punctuation(question)

    # Turkish-aware lower-casing, then trim surrounding whitespace.
    text_sentence = tr_lower(text_sentence).strip()
    question = tr_lower(question).strip()

    n = NGram(N=N_GRAM)
    # Build the sentence grams as a set ONCE so each membership test below
    # is O(1); the original tested `in list`, which was O(len(sentence))
    # per question gram.
    sentence_grams = set(n.split(text_sentence))
    question_grams = list(n.split(question))

    # Duplicate grams in the question still count once per occurrence.
    return sum(1 for gram in question_grams if gram in sentence_grams)
Example #2
0
class NGramMixSegment(Segment):
    def __init__(self, N=(1, 2), pad_len=0):
        # Two gram sizes, one model per size.
        first_n, second_n = N
        self.model1 = NGram(N=first_n, pad_len=pad_len)
        self.model2 = NGram(N=second_n, pad_len=pad_len)

    def cut(self, sentence):
        """Segment *sentence* into tokens.

            @params:
                sentence - text to segment.

            @return:
                On success - list of tokens (model1's grams followed by
                             model2's grams).
                On failure - error information.
        """
        return list(self.model1.split(sentence)) + list(self.model2.split(sentence))
Example #3
0
class NGramSegment(Segment):
    def __init__(self, N=2, pad_len=0):
        self.model = NGram(N=N, pad_len=pad_len)

    def cut(self, sentence):
        """Segment *sentence* into tokens.

            @params:
                sentence - text to segment.

            @return:
                On success - list of tokens.
                On failure - error information.
        """
        grams = self.model.split(sentence)
        return list(grams)
Example #4
0
# NOTE(review): Python 2 script fragment — `sim_arry6`, `string`, `word_grams`
# and the `jellyfish` import are defined earlier in the file, outside this view.
print 'jaro winkler', sim_arry6
# Match Rating Approach comparison of every string against the first entry.
sim_arry7 = [
    jellyfish.match_rating_comparison(unicode(string[0]), unicode(s))
    for s in string
]
print 'match rating comparison', sim_arry7
# tokens = word_tokenize([string])
# print(string_token)
# print tfidf_matrix

# print(y.toarray()

# Word-level n-grams for each string (word_grams defined elsewhere).
ngram_array = [word_grams(s.split(' ')) for s in string]
# print ngram_array
n = NGram()
# print list(n.split(string[0]))
# Rebind ngram_array to the character n-gram decomposition of each string.
ngram_array = [list(n.split(s)) for s in string]
# print ngram_array
# NGram.compare yields a similarity score using 4-grams here.
sim_arry8 = [NGram.compare(string[0].lower(), s.lower(), N=4) for s in string]
print 'ngram', sim_arry8
def jaccard_distance(a, b):
    """Return the Jaccard coefficient |a & b| / |a | b| of two set-likes.

    NOTE(review): despite the name, this is a *similarity* (1.0 means
    identical), not a distance; the name is kept for existing callers.

    @params:
        a, b - objects supporting intersection()/union() (plain sets or
               NGram instances).

    @return:
        float in [0, 1].  Two empty sets are treated as identical (1.0)
        instead of raising ZeroDivisionError as the old code did.
    """
    # len() works directly on the set results; no need to materialize lists.
    inter_len = float(len(a.intersection(b)))
    union_len = float(len(a.union(b)))
    if union_len == 0:
        # Both inputs empty: vacuously identical.
        return 1.0
    return inter_len / union_len


# print list(ngram_array[0].intersection(ngram_array[1]))
sim_arry9 = [
    jaccard_distance(NGram(ngram_array[0]), NGram(s)) for s in ngram_array
Example #5
0
def ngram_str(s, N=3):
    """Return the character n-grams of *s*, space-joined (spaces become '_')."""
    underscored = s.replace(" ", "_")
    grams = NGram(N=N).split(underscored)
    return " ".join(grams)
class NgramIndex():
    """
    Class used for encoding words in ngram representation
    """
    def __init__(self, n, loaded=False):
        """
        Constructor

        Parameters
        ----------
        n : int
            ngram size
        loaded : bool, optional
            True when the instance is being restored from a saved state;
            relaxes the length check in complete().
        """
        self.ngram_gen = NGram(N=n)

        self.size = n
        # Index 0 is reserved for the empty/padding ngram.
        self.ngram_index = {"": 0}
        self.index_ngram = {0: ""}
        self.cpt = 0          # highest index assigned so far
        self.max_len = 0      # longest ngram sequence seen by split_and_add()

        self.loaded = loaded

    def split_and_add(self, word):
        """
        Split word in multiple ngram and add each one of them to the index

        Parameters
        ----------
        word : str
            a word
        """
        # Spaces are encoded as '$' so they survive the ngram split.
        cleaned = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(cleaned))
        # Plain loop instead of a throwaway list comprehension for side effects.
        for ngram in ngrams:
            self.add(ngram)
        self.max_len = max(self.max_len, len(ngrams))

    def add(self, ngram):
        """
        Add a ngram to the index

        Parameters
        ----------
        ngram : str
            ngram
        """
        if ngram not in self.ngram_index:
            self.cpt += 1
            self.ngram_index[ngram] = self.cpt
            self.index_ngram[self.cpt] = ngram

    def encode(self, word):
        """
        Return a ngram representation of a word

        Parameters
        ----------
        word : str
            a word

        Returns
        -------
        list of int
            list of ngram index
        """
        cleaned = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(cleaned))
        # Unknown ngrams are silently skipped, as before.
        return [self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index]

    def complete(self, ngram_encoding, MAX_LEN, filling_item=0):
        """
        Complete a ngram encoded version of word with void ngram. It's necessary for neural network.

        Parameters
        ----------
        ngram_encoding : list of int
            first encoding of a word
        MAX_LEN : int
            desired length of the encoding
        filling_item : int, optional
            ngram index you wish to use, by default 0

        Returns
        -------
        list of int
            list of ngram index
        """
        # A loaded index may have been built with a different max_len:
        # truncate instead of failing the assertion below.
        if self.loaded and len(ngram_encoding) >= MAX_LEN:
            return ngram_encoding[:MAX_LEN]
        assert len(ngram_encoding) <= MAX_LEN
        # Pad in place up to MAX_LEN (callers rely on getting the same list back).
        ngram_encoding.extend([filling_item] * (MAX_LEN - len(ngram_encoding)))
        return ngram_encoding

    def save(self, fn):
        """
        Save the NgramIndex

        Parameters
        ----------
        fn : str
            output filename
        """
        data = {
            "ngram_size": self.size,
            "ngram_index": self.ngram_index,
            "cpt_state": self.cpt,
            "max_len_state": self.max_len
        }
        # 'with' guarantees the handle is closed (the old code leaked it).
        with open(fn, 'w') as f:
            json.dump(data, f)

    @staticmethod
    def load(fn):
        """
        Load a NgramIndex state from a file.

        Parameters
        ----------
        fn : str
            input filename

        Returns
        -------
        NgramIndex
            ngram index

        Raises
        ------
        ValueError
            raised if the input file is not valid JSON
        KeyError
            raised if a required field does not appear in the input file
        """
        try:
            with open(fn) as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            # The old code only printed here and then crashed with a
            # NameError on `data`; raise a clear, chained error instead.
            raise ValueError("Data file must be a JSON") from e
        for key in ["ngram_size", "ngram_index", "cpt_state", "max_len_state"]:
            if key not in data:
                raise KeyError("{0} field cannot be found in given file".format(key))
        new_obj = NgramIndex(data["ngram_size"], loaded=True)
        new_obj.ngram_index = data["ngram_index"]
        # Rebuild the reverse mapping from the restored forward mapping.
        new_obj.index_ngram = {v: k for k, v in new_obj.ngram_index.items()}
        new_obj.cpt = data["cpt_state"]
        new_obj.max_len = data["max_len_state"]
        return new_obj