Example No. 1
def cwi2(sentences, words, zipf_freq):
    # Attach a Zipf frequency to each [word, start, end] annotation.
    word_freqs = [[[x[0], x[1], x[2],
                    zipf_frequency(x[0], 'en')] for x in y] for y in words]
    for i, wf_sort in enumerate(word_freqs):
        for ambg_word in wf_sort:
            # Only words rarer than the given Zipf threshold get simplified.
            if ambg_word[3] < zipf_freq:
                substitute = lesk(sentences[i], ambg_word)
                if substitute == 0:
                    continue
                if not substitute[0][0] > 0:
                    continue
                synset = substitute[0][1]
                subWord = synset.lemma_names()

                # Prefer the most frequent lemma of the chosen synset as the replacement.
                subWord = [[zipf_frequency(x, 'en'), x] for x in subWord]
                subWord = sorted(subWord, reverse=True)
                sentences[i] = re.sub(ambg_word[0], subWord[0][1],
                                      sentences[i])
    return sentences
Example No. 2
def canUse(candidate, past):
    """
    Check whether a candidate is OK to use.
    """
    candidateFrequency = wordfreq.zipf_frequency(candidate,
                                                 "en",
                                                 wordlist="large")
    candidateRootFrequency = max(
        candidateFrequency,
        wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large"))

    # Reject words that are too infrequent or too frequent (like "a" or "the")
    if candidateFrequency < 2.3 or candidateFrequency > 6:
        return False

    # Mostly, this rejects '#'-containing words
    if not candidate.isalpha():
        return False

    # Is it a bad word?
    if candidate in bad_words:
        return False

    # Now, we check if we've used a related word before.
    if any(map(lambda w: lexicallyRelated(candidate, w), past)):
        return False

    # If we have a relatively infrequent word that is too related to
    # our past three rounds of words, we should reject it.
    if sum(similarityScore(x, candidate)
           for x in past[-6:]) > 2 and candidateRootFrequency < 3.2:
        return False

    # otherwise, we're ok!
    return True
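For context, the 2.3 and 6 cut-offs above sit on wordfreq's Zipf scale, which runs roughly from 0 (never seen) up to about 8 for the most common words. A minimal sketch, assuming only that wordfreq is installed (the sample words are illustrative):

from wordfreq import zipf_frequency

# Very common words score high, rarer words score low, and strings wordfreq
# has never seen score 0.
for w in ["the", "keyboard", "sesquipedalian", "xqzvkp"]:
    print(w, zipf_frequency(w, "en", wordlist="large"))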
Example No. 3
def rarest_word():
    f = open(file_name, "r", encoding='utf-8')
    d = enchant.Dict("en_US")

    lowest_freq_m = [8, ""]
    lowest_freq_n = [8, ""]

    for text in f:
        words = text.split(" ")
        for word in words:
            # check that the word is a valid English word and that we have a frequency
            if (len(word) > 0 and d.check(word)
                    and wordfreq.zipf_frequency(word, 'en') != 0):
                freq = wordfreq.zipf_frequency(word, 'en')
                text = text[text.find("-") + 2:len(text) - 1]
                if (len(text) > 0):
                    if (text[0] == "E" and freq < lowest_freq_m[0]):
                        lowest_freq_m[0] = freq
                        lowest_freq_m[1] = word
                    elif (text[0] == "L" and freq < lowest_freq_n[0]):
                        lowest_freq_n[0] = freq
                        lowest_freq_n[1] = word

    f.close()
    return (lowest_freq_m, lowest_freq_n)
Example No. 4
def cwi(sentences, words):
    # Attach a Zipf frequency to each [word, start, end] annotation and sort
    # each sentence's words from rarest to most frequent.
    word_freqs = [[[x[0], x[1], x[2],
                    zipf_frequency(x[0], 'en')] for x in y] for y in words]
    word_freqs_sorted = [
        sorted(y, key=lambda x: x[3], reverse=False) for y in word_freqs
    ]
    for i, wf_sort in enumerate(word_freqs_sorted):
        # Disambiguate the rarest word in the sentence.
        substitute = lesk(sentences[i], wf_sort[0])
        if substitute == 0:
            continue
        if not substitute[0][0] > 0:
            continue

        synset = substitute[0][1]
        subWord = synset.lemma_names()

        # Replace it with the most frequent lemma of the chosen synset.
        subWord = [[zipf_frequency(x, 'en'), x] for x in subWord]
        subWord = sorted(subWord, reverse=True)
        sentences[i] = re.sub(wf_sort[0][0], subWord[0][1], sentences[i])
    return sentences
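The words argument appears to be, per sentence, a list of [token, start, end]-style triples. Below is a hedged sketch of just the complex-word-identification step (sorting a sentence's tokens by Zipf frequency so the rarest comes first); the project-specific lesk() substitution step is omitted.

from wordfreq import zipf_frequency

sentence = "The recalcitrant witness obfuscated the answer"
tokens = sentence.split()
# The second and third fields stand in for the positional metadata the real pipeline supplies.
triples = [[tok, i, i + 1] for i, tok in enumerate(tokens)]
with_freq = [[t[0], t[1], t[2], zipf_frequency(t[0], 'en')] for t in triples]
rarest_first = sorted(with_freq, key=lambda x: x[3])
print(rarest_first[0])  # the candidate word for simplification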
Example No. 5
    def sim(self, w1, w2):
        if w1 not in self.word_to_idx or w2 not in self.word_to_idx:
            return 0

        word_freq_factor = zipf_frequency(w1, "en") * zipf_frequency(w2, "en")
        return (word_freq_factor) * np.dot(self.vectors[self.word_to_idx[w1]],
                                           self.vectors[self.word_to_idx[w2]])
Example No. 6
def add_entry(db, title, word):
    lemma = LEMMATIZER.lookup('en', word)[0]
    title = title.lower().split(" (")[0]
    if wordfreq.zipf_frequency(lemma, 'en') < 6 and wordfreq.zipf_frequency(
            word, 'en') < 6:
        db.execute(
            "INSERT OR IGNORE INTO words (page, word, lemma) VALUES (?, ?, ?)",
            (title, word, lemma))
Example No. 7
def convert_text_to_keywords(text, high_freq, src_lang):
    clean = re.sub(r"[,.'`’'|—;:@#?¿!¡<>_\-\"”“&$\[\]\)\(\\\/]+\ *", " ", text)
    lowerString = clean.lower()
    words = lowerString.split(sep=None)
    print('words c', words)
    keywords = []
    for word in words:
        print('zipf_frequency(word, get_lang_code(src_lang))',
              zipf_frequency(word, get_lang_code(src_lang)))
        print('high_freq', high_freq)
        if (not word.isdigit() and "/" not in word and "\\" not in word
                and len(word) > 1 and
                zipf_frequency(word, get_lang_code(src_lang)) <= high_freq):
            keywords.append(word)
    return keywords
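A self-contained sketch of the same clean-then-filter idea: get_lang_code() below is a stub standing in for the example's helper, the punctuation character class is abbreviated, and 4.0 is only an illustrative high_freq threshold.

import re
from wordfreq import zipf_frequency

def get_lang_code(lang):  # stub: the real helper maps a language name to a wordfreq code
    return {"english": "en", "spanish": "es"}.get(lang.lower(), "en")

text = "The translator kept only the rarer, more useful words!"
clean = re.sub(r"[,.;:!?¿¡\"'()\[\]\-]+ *", " ", text).lower()
keywords = [w for w in clean.split()
            if not w.isdigit() and "/" not in w and "\\" not in w and len(w) > 1
            and zipf_frequency(w, get_lang_code("English")) <= 4.0]
print(keywords)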
Example No. 8
def parse_readings(filename, words_traditional, words_simplified):
    for line in read_csv(filename):
        if len(line) != 3 or line[0].startswith("#"):
            continue

        codepoint = line[0]
        fieldname = line[1]
        content = line[2]

        if fieldname not in ("kCantonese", "kMandarin", "kDefinition"):
            continue

        character = chr(int(codepoint[2:], 16))

        entry_added = False

        if character in words_traditional:
            freq = zipf_frequency(character, "zh")

            for entry in words_traditional[character]:
                entry.add_freq(freq)
                if fieldname == "kCantonese":
                    entry.add_jyutping(content)
                elif fieldname == "kMandarin":
                    pin = convert_pinyin_to_tone_numbers(content, character)
                    entry.add_pinyin(pin)
                elif fieldname == "kDefinition":
                    entry.add_defs([("", x.strip())
                                    for x in content.split(";")])

            entry_added = True

        if character in words_simplified:
            # Ignore simplified characters
            entry_added = True

        if not entry_added:
            trad = simp = character
            freq = zipf_frequency(trad, "zh")
            jyut = content if fieldname == "kCantonese" else ""
            pin = (convert_pinyin_to_tone_numbers(content, trad)
                   if fieldname == "kMandarin" else "")
            defs = ([("", x.strip()) for x in content.split(";")]
                    if fieldname == "kDefinition" else [])

            entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
            words_traditional[trad].append(entry)
            words_simplified[simp].append(entry)
Example No. 9
def parse_same_word_file(filename, words):
    for line in read_csv(filename):
        if len(line) != 2 or line[0] == "詞彙":
            continue

        trad = line[0]
        simp = HanziConv.toSimplified(trad)
        pin = lazy_pinyin(
            trad,
            style=Style.TONE3,
            neutral_tone_with_five=True,
        )
        pin = " ".join(pin).lower()
        pin = pin.strip().replace("v", "u:")
        jyut = pinyin_jyutping_sentence.jyutping(trad,
                                                 tone_numbers=True,
                                                 spaces=True)
        freq = zipf_frequency(trad, "zh")
        defs = [
            objects.DefinitionTuple("​".join(jieba.cut(line[1])), "臺陸用法和差異",
                                    [])
        ]

        entry = objects.Entry(trad, simp, pin, jyut, freq=freq, defs=defs)
        words.add(entry)
Example No. 10
def wordfreqs(text):
    freqs = []
    for tok in wordfreq.tokenize(text, 'en'):
        freq = wordfreq.zipf_frequency(tok, 'en')
        if freq != 0:
            freqs.append(freq)
    return np.array(freqs)
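Usage sketch for wordfreqs() above, mirroring its tokenize-and-score loop so it runs on its own (requires only wordfreq and numpy; the sample sentence is illustrative):

import numpy as np
import wordfreq

text = "Zipf frequencies summarise how common each token is"
freqs = np.array([wordfreq.zipf_frequency(tok, 'en')
                  for tok in wordfreq.tokenize(text, 'en')
                  if wordfreq.zipf_frequency(tok, 'en') != 0])
print(len(freqs), freqs.min(), round(freqs.mean(), 2))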
Example No. 11
def show_frequencies():
    show_info_text()
    if text_type.get() == 'youtube':
        createTextFileFromYoutube()
    source_text = get_source_text()
    if source_text:
        source_text = source_text.lower()
        excluded_words = []
        word_frequency_dictionary = dict()
        clean = re.sub(r"[,.'`’'|—;:@#?¿!¡<>_\-\"”“&$\[\]\)\(\\\/]+\ *", " ",
                       source_text)
        words = clean.split()
        for word in words:
            if word_excluded(word):
                if word not in excluded_words:
                    excluded_words.append(word)
            elif not word.isdigit():
                word_frequency_dictionary[word] = round(
                    zipf_frequency(word,
                                   get_lang_code(str(src_language.get()))), 1)
        for x in np.arange(0, 10, 0.1):
            x = 10 - x
            clean_x = round(x, 1)
            words_with_x_freq = []
            for word, freq in word_frequency_dictionary.items():
                # Compare against the rounded value to avoid float precision misses.
                if freq == clean_x:
                    words_with_x_freq.append(word)
            if words_with_x_freq:
                printtk(str(clean_x))
                printtk(str(words_with_x_freq))
        if excluded_words:
            printtk('EXCLUDED:')
            printtk(str(excluded_words))
Example No. 12
def get_bert_candidates(input_text,
                        list_cwi_predictions,
                        numb_predictions_displayed=10):
    list_candidates_bert = []
    for word, pred in zip(input_text.split(), list_cwi_predictions):
        if (pred and
            (pos_tag([word])[0][1] in ['NNS', 'NN', 'VBP', 'RB', 'VBG', 'VBD'])
            ) or (zipf_frequency(word, 'en')) < 3.1:
            replace_word_mask = input_text.replace(word, '[MASK]')
            text = f'[CLS]{replace_word_mask} [SEP] {input_text} [SEP] '
            tokenized_text = tokenizer.tokenize(text)
            masked_index = [
                i for i, x in enumerate(tokenized_text) if x == '[MASK]'
            ][0]
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            segments_ids = [0] * len(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            # Predict all tokens
            with torch.no_grad():
                outputs = model(tokens_tensor, token_type_ids=segments_tensors)
                predictions = outputs[0][0][masked_index]
            predicted_ids = torch.argsort(
                predictions, descending=True)[:numb_predictions_displayed]
            predicted_tokens = tokenizer.convert_ids_to_tokens(
                list(predicted_ids))
            list_candidates_bert.append((word, predicted_tokens))
    return list_candidates_bert
Example No. 13
def add_zipf_frequency(word_count):
    words_stats = []

    for w, c in word_count.items():
        freq = zipf_frequency(w, "en")
        words_stats.append(WordStats(w, c, freq))

    return words_stats
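A hedged usage sketch: WordStats is project-specific, so a hypothetical namedtuple stands in for it here, and collections.Counter builds the word-to-count mapping.

from collections import Counter, namedtuple
from wordfreq import zipf_frequency

WordStats = namedtuple("WordStats", ["word", "count", "zipf"])  # hypothetical stand-in

word_count = Counter("the cat sat on the mat".split())
words_stats = [WordStats(w, c, zipf_frequency(w, "en")) for w, c in word_count.items()]
print(words_stats[0])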
Example No. 14
def get_tag_TF(tag_num, tag2id, id2tag):
    ret = [0.0] * tag_num
    for key in tag2id:
        ret[tag2id[key]] = zipf_frequency(key, "en")
        # print (key, ret[tag2id[key]])

    print("get tag tf over!")
    return ret
Example No. 15
    def choose_word(words: list):
        toReplace = {}  # key = translated word, value = its Zipf frequency (10 for invalid words)
        current_min = 10
        chosen_index = 0
        translator = Translator()
        for word in words:
            translated = translator.translate(word).text
            if zipf_frequency(translated, 'en') == 0:  # handle invalid words
                toReplace[translated] = 10
            else:
                toReplace[translated] = zipf_frequency(translated, 'en')
            if toReplace[translated] < current_min:
                current_min = toReplace[translated]
                chosen_index = words.index(word)
        chosen_word = min(toReplace, key=toReplace.get)

        return {chosen_index: chosen_word}
Example No. 16
def examine(s, szf, do_implicits=False, do_explicits=True):
    explicits = []
    if do_explicits:
        explicits = re.findall(r"#\w+", s)
        explicits = [a.replace("#", "").lower() for a in explicits]
    implicits = []
    if do_implicits:
        # Drop lines containing hashtags, underscores or quoted strings.
        ss = "\n".join(
            list(
                filter(
                    lambda x: "#" not in x and "_" not in x
                    and not re.match("'.+'", x),
                    s.split("\n")
                )
            )
        )
        # Keep word-like tokens longer than two characters that are not stopwords.
        spl = list(
            filter(
                lambda x: re.match(r"\w+", x)
                and x not in CatIndex.en_stops
                and x not in CatIndex.ru_stops and len(x) > 2,
                re.split(r"\s", ss)
            )
        )
        for a in spl:
            ru = CatIndex.ru.parse(a)
            for b in ru:
                for c in CatIndex.ru_linkable:
                    if str(b.tag).startswith(c):
                        zf = wordfreq.zipf_frequency(a, 'ru')
                        if zf <= szf:
                            implicits.append(a)
            try:
                en = nltk.pos_tag([a])
            except Exception:
                pass
            else:
                for b in en:
                    for c in CatIndex.en_linkable:
                        if b[1].startswith(c):
                            zf = wordfreq.zipf_frequency(a, 'en')
                            if zf <= szf:
                                implicits.append(a)
    r = {
        "explicits": explicits, "implicits": implicits
    }
    return r
Example No. 17
def make_dataframe(filename):
    # Read pandas from csv data
    init = pd.read_csv(filename)

    # Detect terminal width for dataframe printing
    pd.options.display.width = 0

    # Keep only these three fields... Be careful (names will change after experiment)
    filtered = init[['positive', 'antonym', 'response']]

    # Strip leading/trailing whitespace and lowercase all stimuli and responses
    filtered['response'] = filtered['response'].apply(
        lambda word: word.lower().strip())
    filtered['positive'] = filtered['positive'].apply(
        lambda word: word.lower().strip())

    # Sort by stimuli and then response
    sorted = filtered.sort_values(['positive', 'response'])

    # Add a column for response count
    sorted = sorted.assign(ant_count=sorted.groupby(
        ['positive', 'response']).response.transform('count'))

    # Remove duplicates (because we have a count)
    sorted = sorted.drop_duplicates()

    # Calculate transition probability: specific_antonym.count()/all_antonyms.count()
    sorted = sorted.assign(
        trans_prob=sorted.groupby('positive').transform(lambda x: x / x.sum()))
    sorted = sorted.reset_index(drop=True)

    # Add bool column for morphological antonyms
    sorted = sorted.assign(is_morph=sorted['response'] == sorted['antonym'])

    # Add word frequencies for response words
    sorted = sorted.assign(freq_absolute=sorted['response'].apply(
        lambda word: zipf_frequency(word, 'en')))

    # Add relative word frequencies: response_freq - stimuli_freq
    sorted = sorted.assign(
        freq_relative=sorted['freq_absolute'] -
        sorted['positive'].apply(lambda word: zipf_frequency(word, 'en')))

    return sorted
Example No. 18
def filter_by_freq(word_list, lower_freq_bound, upper_freq_bound):
    """Filters words based on their relative frequency and adds frequency info
    to the wordlist"""
    filtered_word_list = []
    for word in word_list:
        freq = wordfreq.zipf_frequency(word[0], 'zh', wordlist='large', minimum=0.0)
        if lower_freq_bound <= freq <= upper_freq_bound:
            filtered_word_list.append(word)
    return filtered_word_list
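A hedged usage sketch: word[0] suggests each entry is a list whose first element is the Chinese headword, so dummy [headword, gloss] pairs stand in for the real word list and the bounds are illustrative.

import wordfreq

word_list = [["你好", "hello"], ["电脑", "computer"], ["饕餮", "glutton"]]
kept = [w for w in word_list
        if 2.0 <= wordfreq.zipf_frequency(w[0], 'zh', wordlist='large', minimum=0.0) <= 6.0]
print(kept)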
Example No. 19
    def identify_emerging_concepts(self,
                                   sent: Span,
                                   section: Section,
                                   graph: ConceptGraph,
                                   rule_based=True):
        """Identify concepts in a given sentence that are likely to be emerging concepts.

        :param sent: A spaCy span representing a sentence in a document.
        :param section: The section the sentence appears in.
        :param graph: The concept graph to record the emerging concepts in.
        :param rule_based: Flag indicating whether or not to use the rule-based classifier.
        """
        if not rule_based:
            return

        for token in filter(lambda token: token.dep_ == 'ROOT', sent):
            concept_tokens = []

            if token.lemma_ == 'be':
                if len(
                        list(
                            filter(lambda right: right.dep_ == 'attr',
                                   token.rights))) > 0:
                    concept_tokens = filter(
                        lambda left: left.dep_.endswith('subj'), token.lefts)
            elif token.lemma_ == 'define':
                try:
                    concept_tokens = list(
                        filter(lambda left: left.dep_.endswith('subjpass'),
                               token.lefts))
                except StopIteration:
                    concept_tokens = list(
                        filter(lambda right: right.dep_ == 'dobj',
                               token.rights))
            elif token.lemma_ == 'call':
                concept_tokens = list(
                    filter(lambda right: right.dep_ == 'oprd', token.rights))

            tokens = []

            for token in concept_tokens:
                tokens += token.subtree

            if len(tokens) > 0:
                if tokens[0].tag_ == 'DT':
                    tokens = tokens[1:]

                tokens = filter(lambda token: len(token.text.strip()) > 0,
                                tokens)
                node = Node(' '.join(map(lambda token: token.text, tokens)))

                if node != '' and zipf_frequency(
                        node, 'en') < self.emerging_concept_frequency_cutoff:
                    if node not in graph.nodes:
                        graph.add_node(node, section)

                    graph.emerging_concepts.add(node)
Example No. 20
def get_words(word, pos, hyper_hypo, recursive=False, depth=None):
    """ return a dict { str.lower : zipf_freq.log }
    if depth = NOne, depth = 1
    """
    ans = {}  # a dictioinary
    # find out which synset for the word
    for ss in wordnet.synsets(word, pos):
        # if ss.pos.startswith(pos.upper()):  # this is what we want!
        if hyper_hypo == 'hyper':
            for x in ss.hypernyms(recursive, depth):
                ans.update({syn.replace('_', ' ') : zipf_frequency(syn.replace('_', ' '), 'en') \
                            for syn in x.synonyms})
        elif hyper_hypo == 'hypo':
            for x in ss.hyponyms(recursive, depth):
                ans.update({syn.replace('_', ' ') : zipf_frequency(syn.replace('_', ' '), 'en') \
                            for syn in x.synonyms})
        # break  # only use the first synset, assuming it's the most common one
    return ans
Example No. 21
def add_freq(word_list, add_freq_to_output):
    """ Adds frequencies to output """
    if not add_freq_to_output:
        return word_list
    for word in word_list:
        freq = wordfreq.zipf_frequency(word[0], 'zh', wordlist='large',
                                       minimum=0.0)
        word.append(freq)

    return word_list
Example No. 22
    def predict_next_word(self, prompt, swype):
        """

        Parameters
        ----------
        prompt: str
            previous text
        swype: nuvox.swype.Swype
            swype object - containing key_trace needed for prediction.
            passing in entire object so that attributes can be updated for analytics.

        Returns
        -------
        ranked_suggestions: list[str]
            ranked list of suggested words
        """
        swype.key_trace = self.remove_blacklisted_keys(swype.key_trace)
        key_trace = copy.copy(swype.key_trace)

        if not key_trace:
            return []

        # Phase 0 - check if punctuation key was selected
        intended_punctuation = self.get_intended_punctuation(key_trace)
        if intended_punctuation:
            swype.word_to_trace_prob = swype.word_to_language_prob = swype.word_to_joint_prob = {intended_punctuation: 1.0}
            return [intended_punctuation]

        # Phase 1) Get dict mapping word --> prob(word | trace) for all possibly intended words using trace algorithm
        word_to_trace_prob = self.trace_algorithm.get_possible_word_to_trace_prob(key_id_sequence=key_trace)
        swype.word_to_trace_prob = word_to_trace_prob  # store in swype obj for analytics
        candidate_words = list(word_to_trace_prob)

        # Phase 2) Keep only the most frequent candidates (by English Zipf frequency), up to MAX_SUGGESTIONS
        candidate_words = list(sorted(candidate_words, key=lambda word: zipf_frequency(word, 'en'), reverse=True))[:self.config.MAX_SUGGESTIONS]

        # Phase 3) Get dict mapping word --> prob(word | prompt) all possibly intended words using language model
        word_to_language_prob = self.language_model.get_candidate_word_probs(prompt,
                                                                             candidate_words=candidate_words,
                                                                             normalize=True)
        swype.word_to_language_prob = word_to_language_prob  # store in swype obj for analytics

        # Phase 4) Get dict mapping word --> prob(word | trace) * prob(word | prompt) (i.e. the joint probability)
        # TODO - need some sort of scaling factor to control influence of each model
        w = 0.75  # relative weight on the trace probability vs language model prob
        word_to_joint_prob = {word: ((w * word_to_trace_prob[word]) + ((1-w) * word_to_language_prob[word]))
                              for word in candidate_words}
        swype.word_to_joint_prob = word_to_joint_prob  # store in swype obj for analytics

        ranked_suggestions = sorted(word_to_joint_prob.keys(), key=lambda k: word_to_joint_prob.get(k, 0), reverse=True)

        if self.need_to_capitalize(prompt):
            ranked_suggestions = [word.capitalize() for word in ranked_suggestions]

        return ranked_suggestions
Example No. 23
def cal_gram_med(SLD, n):
    """
    Compute the median Zipf frequency of a string's character n-grams.
    :param SLD: the string
    :param n: n
    :return: the median n-gram frequency
    """
    grams = [SLD[i:i + n] for i in range(len(SLD) - n + 1)]
    fre = list()
    for s in grams:
        fre.append(wordfreq.zipf_frequency(s, 'en'))
    return np.median(fre)
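A quick sketch of what cal_gram_med() measures: the character n-grams of a string and the median of their Zipf frequencies (n-grams wordfreq has never seen simply score 0). The sample string is illustrative.

import numpy as np
import wordfreq

SLD, n = "paypal", 2
grams = [SLD[i:i + n] for i in range(len(SLD) - n + 1)]
print(grams)  # ['pa', 'ay', 'yp', 'pa', 'al']
print(np.median([wordfreq.zipf_frequency(g, 'en') for g in grams]))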
Example No. 24
def get_features(token):
    token = token.replace('<EOS>', '')
    return pd.Series({
        'length': len(token),
        'logfreq': wordfreq.zipf_frequency(token, 'en'),
        'has_upper': 0 if token.lower() == token else 1,
        'has_punct': 1 if any(j in string.punctuation for j in token) else 0,
    })
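Usage sketch for get_features() above: applying it across a handful of tokens yields a small feature table. The function is repeated here only so the sketch runs on its own.

import string

import pandas as pd
import wordfreq

def get_features(token):
    token = token.replace('<EOS>', '')
    return pd.Series({
        'length': len(token),
        'logfreq': wordfreq.zipf_frequency(token, 'en'),
        'has_upper': 0 if token.lower() == token else 1,
        'has_punct': 1 if any(j in string.punctuation for j in token) else 0,
    })

tokens = pd.Series(["Hello", "world!", "rare<EOS>"])
print(tokens.apply(get_features))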
Example No. 25
def get_src_words_and_phrases(str,low_freq,high_freq,src_langcode,splitters):
	src_list = dict()
	rejected_words = []
	current_episode = 'ep1'
	episode_count = 0
	clean = re.sub(r"[,.;@#?¿!¡\-\"&$\[\]]+\ *", " ", str)
	lowerString = clean.lower()
	words = lowerString.split()

	for word in words:
		word = word.strip()
		if word in splitters:
			current_episode = word
			episode_count += 1
		else:
			if (not word.isdigit() and 
				word not in rejected_words):
					if word in src_list:
						if current_episode not in src_list[word]:
							src_list[word].append(current_episode)
					else:
						word_src_freq = zipf_frequency(word, src_langcode)
						if word_src_freq < high_freq:
							if word_src_freq > low_freq:
								src_list[word] = [current_episode]
								if print_added == True:
									print('word added', word)
							else:
								rejected_words.append(word)
								if print_rejected_words == True:
									print('							word too rare', word)
						else:
							rejected_words.append(word)
							if print_rejected_words == True:
								print('				word too common', word)
	current_episode = 'ep1'
	clean_phrases = re.sub(r"[;@#\"&$\[\]]+\ *", " ", str)
	clean_phrases_punct = re.sub(r"[,.;@#?¿!¡\-\"&$]+\ *", ".", clean_phrases)
	phrases = clean_phrases_punct.split(".")
	for phrase in phrases:
		if phrase in splitters:
			current_episode = phrase
		else:
			if len(phrase.split()) > 1 and phrase:
				if phrase in src_list:
					if current_episode not in src_list[phrase]:
						src_list[phrase].append(current_episode)
				else:
					if print_added == True:
						print('phrase added', phrase)
					src_list[phrase] = [current_episode]
	sortedDict = sorted(src_list.items(), key=lambda x: x[1])
	return episode_count, sortedDict
Example No. 26
def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
            exists = True
            break
    if not exists:  # Part four: choose the token based on frequency, or search and score by frequency
        freq = zipf_frequency(parsed_token, 'fa')
        if freq < 6:
            result_tokens.append(parsed_token)
        else:
            less_accurate_tokens.append(parsed_token)
Example No. 27
def disambiguate(sentence,
                 algorithm=simple_lesk,
                 context_is_lemmatized=False,
                 similarity_option='path',
                 keepLemmas=False,
                 prefersNone=True,
                 zipf=5):
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: Miss out on POS specification, how to resolve?
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            if zipf_frequency(lemma, 'en') < zipf:
                try:
                    wn.synsets(lemma)[0]
                    if algorithm == original_lesk:  # Note: Original doesn't care about lemmas
                        synset = algorithm(lemma_sentence, lemma)
                    elif algorithm == max_similarity:
                        synset = algorithm(lemma_sentence,
                                           lemma,
                                           pos=pos,
                                           option=similarity_option)
                    else:
                        synset = algorithm(lemma_sentence,
                                           lemma,
                                           pos=pos,
                                           context_is_lemmatized=True)
                except:  # In case the content word is not in WordNet
                    synset = '#NOT_IN_WN#'
            else:
                synset = '#under_zipf#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#') else
                           (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#') else
                           (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
Example No. 28
def get_translation(src_text, dest_langcode, src_langcode):
	dest_text = translate(src_text, dest_langcode, src_langcode)
	print('dest_text',dest_text)
	word_src_freq = 0
	word_dest_freq = 0
	should_make_note = True
	if src_text == dest_text or dest_text == '':
		translation_attempt = 1  # look into using a Turkey VPN instead of this sleep
		word_src_freq = zipf_frequency(src_text, src_langcode)
		word_dest_freq = zipf_frequency(src_text, dest_langcode)
		if word_src_freq >= word_dest_freq:
			while translation_attempt < 5:
				time.sleep(translation_attempt)
				dest_text = translate(src_text, dest_langcode, src_langcode)
				if src_text == dest_text:
					translation_attempt += translation_attempt
				else:
					translation_attempt = 5
		else:
			should_make_note = False
			print('rejected because more common in dest:', src_text, word_src_freq,word_dest_freq )

	if dest_text != '':	
		add_translation_to_local_dictionary(src_text, dest_text, dest_langcode, src_langcode)

	if not should_make_note:
		dest_text = ''
	return dest_text
Example No. 29
    def _categorise_nodes(self, graph_based=True):
        """Categorise nodes in the graph into 'a priori' and 'emerging' concepts.

        Nodes that are only referenced from one section represent 'a priori references',
        all other nodes represent 'emerging concepts'.

        :param graph_based: Flag indicating whether or not to use the graph-based classification method.
        """
        if graph_based:
            for section in self.sections:
                for node in self.section_listings[section]:
                    # Skip concepts that have already been categorised during parsing.
                    if node in self.emerging_concepts:
                        continue

                    referencing_sections = set()

                    for tail in self.adjacency_index[node]:
                        tail_section = self.section_index[tail]
                        referencing_sections.add(tail_section)

                    if (len(referencing_sections) > 1
                            and zipf_frequency(node, 'en')
                            < self.emerging_concept_frequency_cutoff):
                        self.emerging_concepts.add(node)
                    else:
                        self.a_priori_concepts.add(node)

        # Enforce transitivity of concept labels such that all concepts that contain an emerging concept as a
        # constituent are also labelled as emerging concepts.
        for node in self.nodes:
            found = False

            for token in node.split(' '):
                for other in self.emerging_concepts:
                    if token in other.split(' '):
                        self.a_priori_concepts.discard(node)
                        self.emerging_concepts.add(node)
                        found = True

                    if found:
                        break

                if found:
                    break

            # Need to add a priori concepts here if we are not using the graph-based classifier
            # (i.e. we are using the rule-based classifier)
            if not graph_based and not found:
                self.a_priori_concepts.add(node)
Example No. 30
def generate_wordlist(n: int) -> List[Tuple[str, str]]:
    wiki = WiktionarySearcher()
    words = []
    while len(words) < n:
        try:
            word = wiki.generate_word()
        except Exception:
            continue
        word, meanings = word.text, word.meanings
        if not meanings:
            continue
        freq = zipf_frequency(word, 'ru')
        if freq <= 1:
            words.append((word, random.choice(meanings)))
    return words
Example No. 31
import csv
import html
import sys
import wordfreq

if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

pairs = {}

with open(pairsPath, 'r', encoding='utf-8') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)

        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                     for word in words]

        minfreq = min(freqs)
        avgfreq = sum(freqs) / float(len(freqs))
        pairs[row[0]] = (minfreq, avgfreq, row[1])

pairList = list(pairs.items())
pairList.sort(reverse=True, key=lambda i: i[1])

for pair in pairList:
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))