def get_search_results(query, exact_match=False):
    """Search the module-level ``data`` rows and return the matches as JSON.

    The query is lower-cased and passed through ``join_bigrams.bigramar``
    together with the module-level ``bigrams``.  With ``exact_match`` the
    whole normalised query is a single search term; otherwise it is split on
    whitespace and every word is also tried in its pluralised and
    singularised form.

    For each row, a term's level is the smallest ``get_match_level`` value
    among its variants (never above ``consts.NOT_FOUND``), and the row's level
    is the largest level among all terms.  Rows whose level equals
    ``consts.NOT_FOUND`` are dropped, so every term has to match somewhere.
    The remaining rows are bucketed by level and emitted level 0 first.

    :param query: search text typed by the user.
    :param exact_match: when true, match the whole query as one term and skip
        the plural/singular variants.
    :return: JSON string with shallow copies of the matching rows, with the
        ``<column>_processed`` helper keys removed.
    """
    normalized = join_bigrams.bigramar(query.lower(), bigrams)
    terms = [normalized] if exact_match else normalized.split()

    # The spelling variants depend only on the query, so build them once here
    # instead of re-inflecting every word for every row.
    if exact_match:
        variants_per_term = [[term] for term in terms]
    else:
        variants_per_term = [
            [term, Inflector.pluralize(term), Inflector.singularize(term)]
            for term in terms
        ]

    # One bucket per match level 0..2.
    # NOTE(review): assumes get_match_level only returns 0, 1, 2 or
    # consts.NOT_FOUND -- confirm against its definition.
    buckets = [[], [], []]
    for row in data:
        row_level = 0
        for variants in variants_per_term:
            term_level = consts.NOT_FOUND
            for variant in variants:
                term_level = min(term_level, get_match_level(variant, row))
            row_level = max(row_level, term_level)
        if row_level != consts.NOT_FOUND:
            # Copy so that stripping keys below leaves ``data`` untouched.
            buckets[row_level].append(copy.copy(row))

    merged_results = buckets[0] + buckets[1] + buckets[2]
    for result in merged_results:
        for level in range(3):
            for column in consts.SEARCH_COLUMNS[level]:
                # Tolerate a column that is absent or listed under more than
                # one level rather than failing the whole search.
                result.pop(column + '_processed', None)
    return json.dumps(merged_results)
def process_spanish_owned():
    """Filter ``raw_words.txt`` and write word, singular and stem variants.

    Reads the space-separated vocabulary from the first line of
    ``valid_words.txt`` (an empty file yields no valid words) and prints how
    many entries it holds.  Then, for every line of ``raw_words.txt``, keeps
    the words that are in that vocabulary or in the hard-coded
    ``owned_words`` list, always dropping ``servicio``.  Each kept word is
    written, space-separated and one output line per input line, to:

    * ``spanish_words_owned.txt`` -- the word itself,
    * ``spanish_words_sing.txt``  -- its singular form (``bus`` is left as is),
    * ``spanish_words_stem.txt``  -- its Snowball stem.

    A progress line ``<line number> <word> <singular> <stem>`` is printed for
    every kept word.  The words are handled as UTF-8 byte strings
    (``str.decode`` / ``unicode.encode``), i.e. this is Python 2 code.
    """
    from inflector import Inflector, Spanish
    from nltk.stem import SnowballStemmer

    inflector = Inflector(Spanish)
    stemmer = SnowballStemmer("spanish")

    with open('valid_words.txt', "r") as valid_file:
        valid_words = valid_file.readline().split(' ')
    print(len(valid_words))

    owned_words = ['cúster', 'custer', 'cústers', 'custers', 'combi', 'combis', 'susana', 'villaran', 'villarán', 'castañeda']

    # One set for both word sources: the membership test runs once per input
    # word, and scanning a list each time would be linear in the vocabulary.
    accepted_words = set(valid_words)
    accepted_words.update(owned_words)

    with open("raw_words.txt", 'r') as raw_file, \
            open("spanish_words_owned.txt", 'w') as owned_out, \
            open("spanish_words_sing.txt", 'w') as singular_out, \
            open("spanish_words_stem.txt", 'w') as stem_out:
        for nline, line in enumerate(raw_file, 1):
            separator = ''  # no leading space before the first kept word
            for raw_word in line.split(' '):
                word = raw_word.replace('\n', '')
                if not word or word == 'servicio' or word not in accepted_words:
                    continue

                # 'bus' is exempt from singularisation.
                if word != 'bus':
                    word_singular = inflector.singularize(word)
                else:
                    word_singular = word
                word_stemmed = stemmer.stem(word.decode('utf-8')).encode('utf-8')

                owned_out.write(separator + word)
                singular_out.write(separator + word_singular)
                stem_out.write(separator + word_stemmed)
                separator = ' '
                print('%d %s %s %s' % (nline, word, word_singular, word_stemmed))

            # Every input line yields an output line, even when nothing was kept.
            owned_out.write('\n')
            singular_out.write('\n')
            stem_out.write('\n')
def count_noun(tagged_tokens):
    """
    Extract the nouns and convert plural forms to singular, counting each one.

    :param tagged_tokens: iterable of items whose index 0 is the token text
        and index 1 is its tag; an item is treated as a noun when the tag
        contains "NN".
    :return: plain dict mapping each kept lower-cased, singularised noun to
        the number of times it occurred.
    """
    ignored_tokens = [".", ",", "$", "[", "]", ">", "<", "/*", "*/", "*", "+", "-", "=", "%"]
    # Nouns containing any of these characters are skipped.
    # NOTE(review): presumably because they are not usable in MongoDB keys --
    # confirm with the code that stores the result.
    mongo_unsafe_chars = ['.', ',', '$']
    singularizer = Inflector(English)

    counts = {}
    for token in tagged_tokens:
        if "NN" not in token[1]:
            continue
        noun = singularizer.singularize(token[0].lower())
        if not noun or noun in ignored_tokens:
            continue
        if any(char in noun for char in mongo_unsafe_chars):
            continue
        counts[noun] = counts.get(noun, 0) + 1
    return counts
class EnglishInflectorTestCase(unittest.TestCase):
    """Checks ``Inflector(English)`` against a table of singular/plural pairs.

    ``test_pluralize`` feeds every singular through ``pluralize`` and
    ``test_singularize`` feeds every plural through ``singularize``; each must
    return the other half of the pair.
    """

    # Expected singular -> plural pairs.  Words mapped to themselves are
    # expected to come back unchanged in both directions.
    # NOTE(review): "safe" -> "saves" and "virus" -> "viri" look linguistically
    # doubtful; kept as-is because they are the expectations the suite pins.
    singular_to_plural = {
        "search": "searches",
        "switch": "switches",
        "fix": "fixes",
        "box": "boxes",
        "process": "processes",
        "address": "addresses",
        "case": "cases",
        "stack": "stacks",
        "wish": "wishes",
        "fish": "fish",
        "category": "categories",
        "query": "queries",
        "ability": "abilities",
        "agency": "agencies",
        "movie": "movies",
        "archive": "archives",
        "index": "indices",
        "wife": "wives",
        "safe": "saves",
        "half": "halves",
        "move": "moves",
        "salesperson": "salespeople",
        "person": "people",
        "spokesman": "spokesmen",
        "man": "men",
        "woman": "women",
        "basis": "bases",
        "diagnosis": "diagnoses",
        "datum": "data",
        "medium": "media",
        "analysis": "analyses",
        "node_child": "node_children",
        "child": "children",
        "experience": "experiences",
        "day": "days",
        "comment": "comments",
        "foobar": "foobars",
        "newsletter": "newsletters",
        "old_news": "old_news",
        "news": "news",
        "series": "series",
        "species": "species",
        "quiz": "quizzes",
        "perspective": "perspectives",
        "ox": "oxen",
        "photo": "photos",
        "buffalo": "buffaloes",
        "tomato": "tomatoes",
        "dwarf": "dwarves",
        "elf": "elves",
        "information": "information",
        "equipment": "equipment",
        "bus": "buses",
        "status": "statuses",
        "mouse": "mice",
        "louse": "lice",
        "house": "houses",
        "octopus": "octopi",
        "virus": "viri",
        "alias": "aliases",
        "portfolio": "portfolios",
        "vertex": "vertices",
        "matrix": "matrices",
        "axis": "axes",
        "testis": "testes",
        "crisis": "crises",
        "rice": "rice",
        "shoe": "shoes",
        "horse": "horses",
        "prize": "prizes",
        "edge": "edges",
    }

    def setUp(self):
        self.inflector = Inflector(English)

    def tearDown(self):
        self.inflector = None

    def test_pluralize(self):
        """Every singular in the table must pluralize to its paired plural."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert: the check must still run
            # when Python is started with -O.
            self.assertEqual(
                actual, plural,
                'English Inflector pluralize(%s) should produce "%s" and NOT "%s"'
                % (singular, plural, actual))

    def test_singularize(self):
        """Every plural in the table must singularize to its paired singular."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.singularize(plural)
            self.assertEqual(
                actual, singular,
                'English Inflector singularize(%s) should produce "%s" and NOT "%s"'
                % (plural, singular, actual))
class EnglishInflectorTestCase(unittest.TestCase):
    """Checks ``Inflector(English)`` against a table of singular/plural pairs.

    ``test_pluralize`` feeds every singular through ``pluralize`` and
    ``test_singularize`` feeds every plural through ``singularize``; each must
    return the other half of the pair.
    """

    # Expected singular -> plural pairs.  Words mapped to themselves are
    # expected to come back unchanged in both directions.
    # NOTE(review): "safe" -> "saves" and "virus" -> "viri" look linguistically
    # doubtful; kept as-is because they are the expectations the suite pins.
    singular_to_plural = {
        "search": "searches",
        "switch": "switches",
        "fix": "fixes",
        "box": "boxes",
        "process": "processes",
        "address": "addresses",
        "case": "cases",
        "stack": "stacks",
        "wish": "wishes",
        "fish": "fish",
        "category": "categories",
        "query": "queries",
        "ability": "abilities",
        "agency": "agencies",
        "movie": "movies",
        "archive": "archives",
        "index": "indices",
        "wife": "wives",
        "safe": "saves",
        "half": "halves",
        "move": "moves",
        "salesperson": "salespeople",
        "person": "people",
        "spokesman": "spokesmen",
        "man": "men",
        "woman": "women",
        "basis": "bases",
        "diagnosis": "diagnoses",
        "datum": "data",
        "medium": "media",
        "analysis": "analyses",
        "node_child": "node_children",
        "child": "children",
        "experience": "experiences",
        "day": "days",
        "comment": "comments",
        "foobar": "foobars",
        "newsletter": "newsletters",
        "old_news": "old_news",
        "news": "news",
        "series": "series",
        "species": "species",
        "quiz": "quizzes",
        "perspective": "perspectives",
        "ox": "oxen",
        "photo": "photos",
        "buffalo": "buffaloes",
        "tomato": "tomatoes",
        "dwarf": "dwarves",
        "elf": "elves",
        "information": "information",
        "equipment": "equipment",
        "bus": "buses",
        "status": "statuses",
        "mouse": "mice",
        "louse": "lice",
        "house": "houses",
        "octopus": "octopi",
        "virus": "viri",
        "alias": "aliases",
        "portfolio": "portfolios",
        "vertex": "vertices",
        "matrix": "matrices",
        "axis": "axes",
        "testis": "testes",
        "crisis": "crises",
        "rice": "rice",
        "shoe": "shoes",
        "horse": "horses",
        "prize": "prizes",
        "edge": "edges",
    }

    def setUp(self):
        self.inflector = Inflector(English)

    def tearDown(self):
        self.inflector = None

    def test_pluralize(self):
        """Every singular in the table must pluralize to its paired plural."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert: the check must still run
            # when Python is started with -O.
            self.assertEqual(
                actual, plural,
                'English Inflector pluralize(%s) should produce "%s" and NOT "%s"'
                % (singular, plural, actual))

    def test_singularize(self):
        """Every plural in the table must singularize to its paired singular."""
        for singular, plural in self.singular_to_plural.items():
            actual = self.inflector.singularize(plural)
            self.assertEqual(
                actual, singular,
                'English Inflector singularize(%s) should produce "%s" and NOT "%s"'
                % (plural, singular, actual))
# Answer type implied by the noun that follows "what"/"which", in the order
# the categories are checked (the first category with a hit wins).
_WH_NOUN_TYPES = (
    ('LOCATION', (
        # 'continent' replaces the original 'continental' trigger: it still
        # matches every question the old trigger matched, and additionally
        # recognises the ordinary "what continent ..." wording.
        'country', 'state', 'continent', 'place', 'city', 'province',
        'river', 'region', 'area', 'nationality', 'town', 'borough',
        'location',
    )),
    ('DATE', ('year', 'month', 'day', 'date')),
    ('PERCENT', ('percentage', 'percent')),
    ('ORGANIZATION', (
        'company', 'group', 'organization', 'university', 'school', 'team',
        'program', 'party',
    )),
    ('PERSON', (
        'artist', 'actor', 'actress', 'doctor', 'president', 'person',
    )),
)

# "how ..." phrases and the answer types they imply, in check order.
_HOW_PHRASE_TYPES = (
    (('how much',), ('MONEY', 'NUMBER')),
    (('how long', 'how old'), ('TIME', 'DURATION')),
    (('how many', 'how far'), ('NUMBER',)),
)

# Question words whose answer types do not depend on the question text.
_FIXED_TYPES = {
    'where': ('LOCATION', 'ORGANIZATION'),
    'when': ('DATE', 'TIME', 'DURATION'),
    'who': ('PERSON',),
}


def get_question_type(q_word, question):
    """Guess the expected answer entity types for a question.

    Both arguments are lower-cased and the question is passed through
    ``Inflector().singularize`` before matching.

    * ``what`` / ``which``: the question is searched for "what <noun>"
      phrases first and then for "which <noun>" phrases, whichever of the two
      words ``q_word`` is.  Within each pass the categories are tried in the
      order LOCATION, DATE, PERCENT, ORGANIZATION, PERSON.
    * ``how``: decided by the phrase found ("how much", "how long",
      "how old", "how many", "how far").
    * ``where`` / ``when`` / ``who``: fixed answer types.

    :param q_word: the question word of the question.
    :param question: the full question text.
    :return: a new set of entity-type labels; ``{'O'}`` when nothing matches.
    """
    q_word = q_word.lower()
    question = Inflector().singularize(question.lower())

    if q_word in ('what', 'which'):
        for determiner in ('what', 'which'):
            for label, nouns in _WH_NOUN_TYPES:
                if any('%s %s' % (determiner, noun) in question
                       for noun in nouns):
                    return set([label])
    elif q_word == 'how':
        for phrases, labels in _HOW_PHRASE_TYPES:
            if any(phrase in question for phrase in phrases):
                return set(labels)
    elif q_word in _FIXED_TYPES:
        # Build a fresh set each call so callers may mutate the result.
        return set(_FIXED_TYPES[q_word])

    return set(['O'])
class SpanishInflectorTestCase(unittest.TestCase):
    """Checks ``Inflector(Spanish)`` against a table of singular/plural pairs.

    ``test_pluralize`` feeds every singular through ``pluralize`` and
    ``test_singularize`` feeds every plural through ``singularize``; each must
    return the other half of the pair.
    """

    # Expected singular -> plural pairs.  Words mapped to themselves are
    # expected to come back unchanged in both directions.
    singular_to_plural = {
        "álbum": "álbumes",
        "almacén": "almacenes",
        "androide": "androides",
        "antifaz": "antifaces",
        "árbol": "árboles",
        "atlas": "atlas",
        "autobús": "autobuses",
        "base": "bases",
        "bebé": "bebés",
        "camión": "camiones",
        "casa": "casas",
        "ceutí": "ceutíes",
        "chimpancé": "chimpancés",
        "clan": "clanes",
        "compás": "compases",
        "convoy": "convoyes",
        "coxis": "coxis",
        "crisis": "crisis",
        "déficit": "déficits",
        "eje": "ejes",
        "espíritu": "espíritus",
        "flash": "flashes",
        "frac": "fracs",
        "gafas": "gafas",
        "hipótesis": "hipótesis",
        "inglés": "ingleses",
        "lápiz": "lápices",
        "luz": "luces",
        "montaje": "montajes",
        "no": "noes",
        "otitis": "otitis",
        "padre": "padres",
        "país": "países",
        "papá": "papás",
        "parking": "parkings",
        "portaequipaje": "portaequipajes",
        "radiocasete": "radiocasetes",
        "show": "shows",
        "si": "sis",
        "sí": "síes",
        "tabú": "tabúes",
        "tamiz": "tamices",
        "tanque": "tanques",
        "taxi": "taxis",
        "tijeras": "tijeras",
        "tren": "trenes",
        "virus": "virus",
    }

    def setUp(self):
        self.inflector = Inflector(Spanish)

    def tearDown(self):
        self.inflector = None

    def test_pluralize(self):
        """Every singular in the table must pluralize to its paired plural."""
        # .items() instead of .iteritems(): the latter does not exist on
        # Python 3 dicts, while .items() works on both major versions.
        for singular, plural in self.singular_to_plural.items():
            inflector_pluralize = self.inflector.pluralize(singular)
            # assertEqual rather than a bare assert: the check must still run
            # when Python is started with -O.
            self.assertEqual(
                inflector_pluralize, plural,
                'Spanish Inflector pluralize(%s) should produce "%s" and NOT "%s"'
                % (singular, plural, inflector_pluralize))

    def test_singularize(self):
        """Every plural in the table must singularize to its paired singular."""
        for singular, plural in self.singular_to_plural.items():
            inflector_singularize = self.inflector.singularize(plural)
            self.assertEqual(
                inflector_singularize, singular,
                'Spanish Inflector singularize(%s) should produce "%s" and NOT "%s"'
                % (plural, singular, inflector_singularize))