def query_words(query):
    """Tokenize a query and keep only lower-cased dictionary words that are not stop words."""
    words = nltk.word_tokenize(query)
    res = []
    for w in words:
        if is_word(w.lower()) and w.lower() not in stops:
            res.append(w.lower())
    return res
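# This snippet (and several below) relies on an is_word() helper and a stops set that are
# defined elsewhere. A minimal sketch of what they might look like, assuming is_word simply
# checks membership in NLTK's English word corpus (the real helpers may differ):
import nltk
from nltk.corpus import stopwords, words

nltk.download('punkt', quiet=True)      # tokenizer model used by nltk.word_tokenize
nltk.download('words', quiet=True)      # English word list
nltk.download('stopwords', quiet=True)  # English stop words

_WORDS = {w.lower() for w in words.words()}
stops = set(stopwords.words('english'))

def is_word(w):
    # hypothetical implementation: a token counts as a word if it appears in the corpus
    return w.lower() in _WORDS

# Hypothetical usage: query_words("Which countries export the most wheat?") would then
# return something like ['countries', 'export', 'wheat'].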
def segment_passwd(passwd):
    """Recursively split a password into the dictionary words recognised by utils.is_word."""
    res = []
    for i in range(len(passwd)):
        # try every substring passwd[i:j]; go up to len(passwd) + 1 so that words
        # ending on the last character are also considered
        for j in range(i + 1, len(passwd) + 1):
            word = passwd[i:j]
            if utils.is_word(word):
                res.append(word)
                if j < len(passwd):
                    # segment the remaining suffix and flatten its words into the result
                    res.extend(segment_passwd(passwd[j:]))
                else:
                    return res
    return res
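# A quick way to exercise segment_passwd in isolation is to stub the utils module with a
# tiny word set (hypothetical; the real utils.is_word presumably consults a full dictionary):
import types

utils = types.SimpleNamespace(is_word=lambda s: s in {"sun", "sunflower", "flower", "dog"})

print(segment_passwd("sunflower42dog"))
# prints the dictionary words found while scanning the password left to right; overlapping
# segmentations are all explored, so a word can appear more than once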
def word_search(tree, string=''):
    """
    Exercise 2: Implement a word search using either DFS or BFS.
    Why did you choose one algorithm or the other?
    """
    if tree is None:
        return []
    string += tree.value
    results = []
    if is_word(string):
        results.append(string)
    results += word_search(tree.left, string)
    results += word_search(tree.right, string)
    return results
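# DFS is the natural fit here: the accumulated string must follow a single root-to-node
# path, which is exactly what the recursive descent does. A minimal way to try it, with a
# hypothetical Node class matching the .value/.left/.right attributes the function expects:
class Node:
    def __init__(self, value, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right

# the left branch spells "cat" and "car", the right branch spells "co"
tree = Node('c', left=Node('a', left=Node('t'), right=Node('r')), right=Node('o'))
print(word_search(tree))  # with a dictionary-backed is_word, this should include 'cat' and 'car'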
def make_search_filter():
    if not is_term_list:
        return ''
    term = search_string.strip()
    if not term:
        return ''
    params['$searchString'] = term
    if exact_match:
        return 'eq(text, $searchString)'
    # too small a word fails with 'regular expression is too wide-ranging and can't be executed efficiently'
    use_regexp = is_word(term) and len(term) >= 3
    if use_regexp:
        params['$regexp'] = "{0}.*".format(term)
    regexp = "regexp(text, /$regexp/i)" if use_regexp else ''
    anyoftext = "anyoftext(text, $searchString)"
    exprs = [s for s in [anyoftext, regexp] if len(s) > 0]
    if len(exprs) > 1:
        return "({0})".format(' or '.join(exprs))
    return exprs[0]
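# make_search_filter builds a Dgraph-style filter expression from module-level state
# (is_term_list, search_string, exact_match, params). Tracing the code: assuming
# search_string = "apple" and exact_match = False, params ends up as
# {'$searchString': 'apple', '$regexp': 'apple.*'} and the function returns
# "(anyoftext(text, $searchString) or regexp(text, /$regexp/i))"; with exact_match = True
# it returns "eq(text, $searchString)" instead.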
def dimension_fill(tok, table_name, seuil=0.5):  # seuil: minimum score threshold
    dim_dict = dict[table_name][0]     # dimensions and values of the table
    dim_default = dict[table_name][1]  # default value (if any) for each dimension
    final_dict = {}                    # final result
    non_trivial_dim = []
    # From the tokens, we only keep the keywords, and put them in lower case
    low_tok = lower_list(tok)
    lowered_tok = []
    for t in low_tok:
        if t not in avoid_words:
            lowered_tok.append(t)
    # First, check whether some dimensions are trivial (only one possible value)
    for a in dim_dict:
        if len(dim_dict[a]) == 1:
            final_dict[a] = dim_dict[a][0][0]
        else:
            non_trivial_dim.append(a)
    # For the non-trivial dimensions:
    for d in non_trivial_dim:
        # try each value and give it a score; the value with the best score is kept at the end
        values = dim_dict[d]
        scores = []
        for v in values:
            # put the text of the value in the right format (tokenized, no hyphens)
            text = v[1]
            text = text.replace("-", " ")
            text_tok = nltk.word_tokenize(text)
            # and only keep relevant words (keywords)
            words = []
            for w in text_tok:
                if w.lower() not in stops and is_word(w.lower()):
                    words.append(w.lower())
            # then compute the score
            n = len(words)
            s = 0
            # the score for each word of the name is the maximal proximity between
            # this word and the words of the query
            for w in words:
                m = 0
                for t in lowered_tok:
                    pr = proximity(w, t)
                    m = max(m, pr)
                s += m
            # and we divide by the number of words in the name
            if n > 0:
                s = s / n
            else:
                s = 0
            scores.append(s)
        # take the value with the highest score, and make sure it is above the threshold
        idx = id_max(scores)
        score_max = scores[idx]
        if score_max > seuil:
            final_dict[d] = dim_dict[d][idx][0]
        # otherwise, fall back to the default value
        else:
            if dim_default[d] is not None:
                final_dict[d] = dim_default[d]
            else:
                final_dict[d] = None
    return final_dict
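# dimension_fill depends on a few helpers that are not shown: lower_list, proximity and
# id_max. Minimal sketches, under the assumption that proximity returns a similarity score
# in [0, 1] and id_max returns the index of the largest score (the real helpers may differ):
from difflib import SequenceMatcher

def lower_list(tokens):
    # hypothetical: lower-case every token
    return [t.lower() for t in tokens]

def proximity(a, b):
    # hypothetical: character-level similarity between two words, in [0, 1]
    return SequenceMatcher(None, a, b).ratio()

def id_max(scores):
    # hypothetical: index of the maximum score (0 for an empty list)
    return max(range(len(scores)), key=scores.__getitem__) if scores else 0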
def get_data(text, lang):
    if lang != 'en':
        return
    txt = text.replace(' ', '-')
    url = f'{base}/dictionary/english/{txt}'
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    codes = {
        'C': 'countable',
        'U': 'uncountable',
        'S': 'singular',
    }
    posgram_found = False
    gram_found = False
    if utils.is_word(text):
        yield ('tag', Term(text='word', lang=lang, region=None))
    soup = BeautifulSoup(resp.text, 'html.parser')
    page = soup.find('div', class_='page')
    for dictionary in page.find_all('div', class_='dictionary'):
        header = dictionary.find('div', class_='pos-header')
        body = dictionary.find('div', class_='pos-body')
        posgram = header.find('div', class_='posgram')
        if posgram and not posgram_found:
            pos = find_strip(posgram, 'span', class_='pos')
            term = Term(text=pos, lang=lang, region=None)
            yield ('tag', term)
            posgram_found = True
        if not gram_found:
            for gram in body.find_all('span', class_='gram'):
                for gc in gram.find_all('span', class_='gc'):
                    code = stripped_text(gc)
                    if code in codes and not gram_found:
                        term = Term(text=codes[code], lang=lang, region=None)
                        yield ('tag', term)
                        gram_found = True
        # parse pronunciations
        for dpron in header.find_all('span', class_='dpron-i'):
            region = find_strip(dpron, 'span', 'region')
            amp = header.find('amp-audio')
            for source in amp.find_all('source'):
                file = File(url=base + source.attrs['src'], region=region)
                yield ('audio', file)
            ipa = find_strip(dpron, 'span', class_='ipa')
            if not is_empty(ipa):
                yield ('transcription', Term(text=ipa, lang=lang, region=region))
        # parse definitions, example sentences and images
        for dblock in body.find_all('div', class_='def-block'):
            def_text = stripped_text(dblock.find('div', class_='def'))
            if not is_empty(def_text):
                yield ('definition', Term(text=def_text, lang=lang, region=None))
            img = dblock.find('amp-img')
            if img is not None:
                file = File(url=base + img.attrs['src'], region=None)
                yield ('visual', file)
            for eg in dblock.find_all('span', 'eg'):
                term = Term(text=stripped_text(eg), lang=lang, region=None)
                yield ('in', term)
    # extra example sentences and collocations from the dataset blocks
    for dataset in page.find_all('div', class_='dataset'):
        for eg in dataset.find_all('span', class_='deg'):
            term = Term(text=stripped_text(eg), lang=lang, region=None)
            yield ('in', term)
        cpegs = dataset.find('div', class_='cpegs')
        if cpegs:
            for lbb in cpegs.find_all('div', class_='lbb'):
                for a in lbb.find_all('a', class_='hdib'):
                    term = Term(text=stripped_text(a), lang=lang, region=None)
                    yield ('collocation', term)
    for t in get_translations(text, lang):
        yield t
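# get_data is a generator that scrapes a dictionary page (base, headers, Term, File and the
# find_strip/stripped_text/is_empty helpers are defined elsewhere) and yields (kind, payload)
# pairs. A hypothetical consumer, assuming `import requests` and
# `from bs4 import BeautifulSoup` at module level:
#
#     for kind, payload in get_data('apple', 'en'):
#         if kind == 'definition':
#             print(payload.text)
#         elif kind == 'audio':
#             print(payload.url)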
def keyword(url):
    # load the XML structure as a tree
    file = urlopen(url)
    tree = ET.parse(file)
    root = tree.getroot()
    header = root[0]
    structure = root[1]
    # get the different parts of the metadata
    concepts = get_concepts(structure)
    info = get_info(structure)
    cat = get_category(structure)
    dim = get_dimensions(structure)
    cl = get_codelists(structure)
    cons = get_constaints(structure)
    # The 4 categories
    Name = []
    Description = []
    Dimensions = []
    Category = []
    # Name keywords
    try:
        n = info["Name"]
        for w in nltk.word_tokenize(n):
            if w.lower() not in stops and is_word(w.lower()):
                Name.append(w.lower())
    except:
        pass
    # Description keywords
    try:
        d = info["Description"]
        for w in nltk.word_tokenize(d):
            if w.lower() not in stops and is_word(w.lower()):
                Description.append(w.lower())
    except:
        pass
    # Category keywords
    for c in cat:
        for w in nltk.word_tokenize(c):
            if w.lower() not in stops and is_word(w.lower()):
                Category.append(w.lower())
    # Dimension keywords
    for a in dim:
        conc = concepts[dim[a][1]]  # associated concept
        if a not in avoid_dim:
            for w in nltk.word_tokenize(conc):  # words from the title of the dimension
                if w.lower() not in stops and is_word(w.lower()):
                    Dimensions.append(w.lower())
            for c in cons[a]:  # words from the values of the dimension
                for d in cl[dim[a][2]]:
                    if d[0] == c:
                        for w in nltk.word_tokenize(d[1]):
                            if w.lower() not in stops and is_word(w.lower()):
                                Dimensions.append(w.lower())
    return (set(Name), set(Description), set(Dimensions), set(Category))
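# keyword takes the URL of an XML structure file (the get_* helpers suggest an SDMX-style
# data structure definition) and returns four keyword sets. It assumes
# `from urllib.request import urlopen` and `import xml.etree.ElementTree as ET` at module
# level. Hypothetical usage (the URL is illustrative only):
#
#     name_kw, desc_kw, dim_kw, cat_kw = keyword("https://example.org/sdmx/datastructure.xml")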