def query_words(query):
    # Tokenize the query and keep only the lower-cased tokens that are
    # dictionary words and not stopwords.
    words = nltk.word_tokenize(query)
    res = []
    for w in words:
        w = w.lower()
        if is_word(w) and w not in stops:
            res.append(w)
    return res
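A minimal usage sketch; `stops` and `is_word` are module-level names in the original project, so the NLTK-based stand-ins below are assumptions for illustration (the stopwords and words corpora need to be downloaded once via nltk.download):

import nltk
from nltk.corpus import stopwords, words

stops = set(stopwords.words('english'))        # assumed stopword list
vocab = set(w.lower() for w in words.words())  # assumed English vocabulary

def is_word(w):
    # Assumption: the original checks membership in some dictionary.
    return w in vocab

print(query_words("Which cities have the largest population?"))
# -> the query's non-stopword dictionary tokens, lower-cased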
Example #2
def segment_passwd(passwd):
    # Collect dictionary words found in the password and, whenever a word
    # ends before the end of the string, recurse on the remaining suffix
    # (the recursive result is kept as a nested list).
    res = []
    for i in range(len(passwd)):
        for j in range(i + 1, len(passwd) + 1):
            word = passwd[i:j]
            if utils.is_word(word):
                res.append(word)
                if j < len(passwd):
                    res.append(segment_passwd(passwd[j:]))
                else:
                    return res
    return res
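A quick sketch of how the recursion nests its output; the stand-in for the project's `utils` module below is purely an assumption for illustration:

from types import SimpleNamespace

# Hypothetical replacement for the project's `utils` module.
utils = SimpleNamespace(is_word=lambda w: w in {'sun', 'rise', 'sunrise'})

print(segment_passwd('sunrise'))
# -> ['sun', ['rise'], 'sunrise']  (a nested list holds the segmentation of a suffix)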
Example #3
def word_search(tree, string=''):
    """
    Exercise 2: Implement a word search using either DFS or BFS.
    Why did you choose one algorithm or the other?
    """
    if tree is None:
        return []

    string += tree.value

    results = []
    if is_word(string):
        results.append(string)

    results += word_search(tree.left, string)
    results += word_search(tree.right, string)

    return results
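This is a recursive depth-first traversal: the accumulated `string` is exactly the root-to-node path, which the call stack carries for free (a BFS would have to queue a prefix alongside every node). A minimal driver, with a hypothetical `Node` class and a stub `is_word`:

class Node:
    # Hypothetical node type; the exercise presumably ships its own tree class.
    def __init__(self, value, left=None, right=None):
        self.value, self.left, self.right = value, left, right

def is_word(s):
    # Stand-in dictionary check, for illustration only.
    return s in {'a', 'an', 'at'}

root = Node('a', Node('t'), Node('n'))
print(word_search(root))  # -> ['a', 'at', 'an']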
Example #5
    def make_search_filter():
        if not is_term_list:
            return ''

        text = search_string.strip()
        if not text:
            return ''

        params['$searchString'] = text
        if exact_match:
            return 'eq(text, $searchString)'

        # A word that is too short fails with "regular expression is too wide-ranging
        # and can't be executed efficiently", so only use a regexp for words of at
        # least 3 characters.
        use_regexp = is_word(text) and len(text) >= 3

        if use_regexp:
            params['$regexp'] = "{0}.*".format(text)

        regexp = "regexp(text, /$regexp/i)" if use_regexp else ''
        anyoftext = "anyoftext(text, $searchString)"
        exprs = [s for s in [anyoftext, regexp] if len(s) > 0]
        if len(exprs) > 1:
            return "({0})".format(' or '.join(exprs))
        return exprs[0]
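For illustration, with a non-exact search for 'apple' and `is_word('apple')` truthy, `params` ends up holding {'$searchString': 'apple', '$regexp': 'apple.*'} and the helper returns the filter expression

    (anyoftext(text, $searchString) or regexp(text, /$regexp/i))

while with `exact_match` set it short-circuits to eq(text, $searchString).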
Example #6
def dimension_fill(tok, table_name, seuil=0.5):
    # seuil: score threshold a value must reach to be selected
    dim_dict = dict[table_name][0]      # dimensions and values of the table
    dim_default = dict[table_name][1]   # default value (if any) for each dimension

    final_dict = {}  #final result
    non_trivial_dim = []

    # From the tokens, we only keep the keywords, and put them in lower case
    low_tok = lower_list(tok)
    lowered_tok = []
    for t in low_tok:
        if t not in avoid_words:
            lowered_tok.append(t)

    # First, we try to see if some dimensions are trivial (only one possible value)
    for a in dim_dict:
        if len(dim_dict[a]) == 1:
            final_dict[a] = dim_dict[a][0][0]
        else:
            non_trivial_dim.append(a)

    # For the non-trivial dimensions:
    for d in non_trivial_dim:

        # We give each candidate value a score; the value with the best score is kept at the end.
        values = dim_dict[d]
        scores = []

        for v in values:
            # we put the text of the value into the right format (tokenized, no hyphens)
            text = v[1]
            text = text.replace("-", " ")
            text_tok = nltk.word_tokenize(text)

            #and only keep relevant words (keywords)
            words = []
            for w in text_tok:
                if (w.lower() not in stops and is_word(w.lower())):
                    words.append(w.lower())

            #and then we compute the score
            n = len(words)
            s = 0
            #the score for each word of the name is the maximal proximity between this word and the words of the query
            for w in words:
                m = 0
                for t in lowered_tok:
                    pr = proximity(w, t)
                    m = max(m, pr)
                s += m
            #and we divide by the number of words in the name
            if n > 0:
                s = s / n
            else:
                s = 0
            scores.append(s)
        #we take the value with the highest score, and make sure it is high enough (above a certain threshold)
        idx = id_max(scores)
        score_max = scores[idx]
        if score_max > seuil:
            final_dict[d] = dim_dict[d][idx][0]
        #else, we go for the default value
        else:
            if dim_default[d] is not None:
                final_dict[d] = dim_default[d]
            else:
                final_dict[d] = None

    return final_dict
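The function leans on several helpers that are not shown here (`lower_list`, `proximity`, `id_max`; `dict`, `avoid_words` and `stops` are likewise module-level data). Minimal stand-ins, offered only as assumptions about their contracts:

import difflib

def lower_list(tokens):
    # Assumed contract: lower-case every token.
    return [t.lower() for t in tokens]

def proximity(a, b):
    # Assumed contract: similarity score in [0, 1]; the original project may
    # use a different measure (edit distance, embeddings, ...).
    return difflib.SequenceMatcher(None, a, b).ratio()

def id_max(xs):
    # Assumed contract: index of the largest element.
    return max(range(len(xs)), key=xs.__getitem__)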
Example #7
def get_data(text, lang):
    if lang != 'en':
        return

    txt = text.replace(' ', '-')
    url = f'{base}/dictionary/english/{txt}'

    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    codes = {
        'C': 'countable',
        'U': 'uncountable',
        'S': 'singular',
    }
    posgram_found = False
    gram_found = False

    if utils.is_word(text):
        yield ('tag', Term(text='word', lang=lang, region=None))

    soup = BeautifulSoup(resp.text, 'html.parser')
    page = soup.find('div', class_='page')
    for dictionary in page.find_all('div', class_='dictionary'):
        header = dictionary.find('div', class_='pos-header')
        body = dictionary.find('div', class_='pos-body')

        posgram = header.find('div', class_='posgram')
        if posgram and not posgram_found:
            pos = find_strip(posgram, 'span', class_='pos')
            term = Term(text=pos, lang=lang, region=None)
            yield ('tag', term)
            posgram_found = True
        if not gram_found:
            for gram in body.find_all('span', class_='gram'):
                for gc in gram.find_all('span', class_='gc'):
                    code = stripped_text(gc)
                    if code in codes and not gram_found:
                        term = Term(text=codes[code], lang=lang, region=None)
                        yield ('tag', term)
                        gram_found = True

        # parse pronunciations
        for dpron in header.find_all('span', class_='dpron-i'):
            region = find_strip(dpron, 'span', 'region')
            amp = header.find('amp-audio')
            for source in amp.find_all('source'):
                file = File(url=base + source.attrs['src'], region=region)
                yield ('audio', file)

            ipa = find_strip(dpron, 'span', class_='ipa')
            if not is_empty(ipa):
                yield ('transcription', Term(text=ipa,
                                             lang=lang,
                                             region=region))

        for dblock in body.find_all('div', class_='def-block'):
            def_text = stripped_text(dblock.find('div', class_='def'))
            if not is_empty(def_text):
                yield ('definition', Term(text=def_text,
                                          lang=lang,
                                          region=None))
            img = dblock.find('amp-img')
            if img is not None:
                file = File(url=base + img.attrs['src'], region=None)
                yield ('visual', file)
            for eg in dblock.find_all('span', 'eg'):
                term = Term(text=stripped_text(eg), lang=lang, region=None)
                yield ('in', term)

    for dataset in page.find_all('div', class_='dataset'):
        for eg in dataset.find_all('span', class_='deg'):
            term = Term(text=stripped_text(eg), lang=lang, region=None)
            yield ('in', term)
        cpegs = dataset.find('div', class_='cpegs')
        if cpegs:
            for lbb in cpegs.find_all('div', class_='lbb'):
                for a in lbb.find_all('a', class_='hdib'):
                    term = Term(text=stripped_text(a), lang=lang, region=None)
                    yield ('collocation', term)

    for t in get_translations(text, lang):
        yield t
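A hedged sketch of how the generator might be consumed; `base`, `headers`, `Term`, `File` and the parsing helpers are module-level names in the original project, so nothing here is guaranteed beyond the (kind, value) tuples visible in the yields:

for kind, value in get_data('apple', 'en'):
    if kind == 'definition':
        print('definition:', value.text)
    elif kind == 'audio':
        print('pronunciation file:', value.url)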
Example #8
def keyword(url):
    # loading the xml structure as a tree
    file = urlopen(url)
    tree = ET.parse(file)
    root = tree.getroot()
    header = root[0]
    structure = root[1]

    # getting the different parts of the metadata
    concepts = get_concepts(structure)
    info = get_info(structure)
    cat = get_category(structure)
    dim = get_dimensions(structure)

    cl = get_codelists(structure)
    cons = get_constaints(structure)

    # The 4 categories
    Name = []
    Description = []
    Dimensions = []
    Category = []


    # Name keywords
    try:
        n = info["Name"]
        for w in nltk.word_tokenize(n):
            if (w.lower() not in stops and is_word(w.lower())):
                Name.append(w.lower())
    except:
        pass

    # Description keywords
    try:
        d = info["Description"]
        for w in nltk.word_tokenize(d):
            if (w.lower() not in stops and is_word(w.lower())):
                Description.append(w.lower())
    except:
        pass

    # Category keywords
    for c in cat:
        for w in nltk.word_tokenize(c):
            if (w.lower() not in stops and is_word(w.lower())):
                Category.append(w.lower())

    # Dimension keywords
    for a in dim:
        conc = concepts[dim[a][1]]  # associated concept
        if (a not in avoid_dim):
            for w in nltk.word_tokenize(conc):      #words from the title of the dimension
                if (w.lower() not in stops and is_word(w.lower())):
                    Dimensions.append(w.lower())
            for c in cons[a]:                       #words from the values of the dimension
                for d in cl[dim[a][2]]:
                    if d[0] == c:
                        for w in nltk.word_tokenize(d[1]):
                            if (w.lower() not in stops and is_word(w.lower())):
                                Dimensions.append(w.lower())

    return (set(Name), set(Description), set(Dimensions), set(Category))
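The function returns one keyword set per metadata facet (name, description, dimensions, category); a caller might unpack it like this (the URL is purely hypothetical):

name_kw, desc_kw, dim_kw, cat_kw = keyword('https://example.org/dataset-structure.xml')
print(sorted(name_kw | desc_kw | dim_kw | cat_kw))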