Code example #1
    def clean_caption_biography(self):
        caption, biography = self.caption_biography()
        if biography:
            caption.append(biography)
        self.caption = caption
        clean_cap_bio = []
        for unclean_text in self.caption:
            if isinstance(unclean_text, unicode):
                # remove emoji, hashtag, url, html
                clean_text = ' '.join(
                    re.sub(
                        "([@#][A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                        " ", unclean_text).split())

                clean_text = " ".join(
                    singularize(w).lower()
                    for w in nltk.wordpunct_tokenize(clean_text)
                    if singularize(w).lower() in words and w.isalpha())
                clean_cap_bio.append(clean_text)
            else:
                # skip non-text entries instead of discarding everything cleaned so far
                continue
        self.documents = clean_cap_bio
        return self.documents
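As a rough illustration of what the cleaning regex above strips (mentions/hashtags, stray punctuation, and URLs), here is a standalone run on a made-up caption; the caption string is hypothetical:

import re

caption = "Loving the #sunset at the beach! http://t.co/abc @friend"
cleaned = ' '.join(re.sub(
    r"([@#][A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
    " ", caption).split())
print(cleaned)  # -> 'Loving the at the beach'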
Code example #2
def get_synset(word, pos=''):
    synset = None
    try:
        if pos:
            synset = wordnet.synsets(singularize(lemmatise(word)), w[pos])[0]
        else:
            synset = wordnet.synsets(singularize(lemmatise(word)))[0]
    except IndexError:
        try:
            if pos:
                synset = wordnet.synsets(lemmatise(word), w[pos])[0]
            else:
                synset = wordnet.synsets(lemmatise(word))[0]
        except IndexError:
            try:
                if pos:
                    synset = wordnet.synsets(singularize(word), w[pos])[0]
                else:
                    synset = wordnet.synsets(singularize(word))[0]
            except IndexError:
                try:
                    if pos:
                        synset = wordnet.synsets(word, w[pos])[0]
                    else:
                        synset = wordnet.synsets(word)[0]
                except IndexError:
                    pass

    if pos and synset is None:
        return get_synset(word)

    return synset
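The nested try/except blocks above simply try progressively less-normalised forms of the word until WordNet returns a synset. A minimal, hedged sketch of the same lookup order, assuming the same wordnet, lemmatise, singularize and w POS-map are in scope:

def get_synset_flat(word, pos=''):
    # candidate forms, most normalised first (same order as the nested version)
    for form in (singularize(lemmatise(word)), lemmatise(word),
                 singularize(word), word):
        synsets = wordnet.synsets(form, w[pos]) if pos else wordnet.synsets(form)
        if synsets:
            return synsets[0]
    # nothing found with the POS restriction: fall back to a POS-less lookup,
    # mirroring the recursive call in the original
    return get_synset_flat(word) if pos else None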
Code example #3
def searchScore(projectDescriptions, tools_materials):
    if len(tools_materials) == 0:
        return []
    scoreList = []
    for projDesc in projectDescriptions:
        descEntities = getEntities(projDesc.lower())
        score = 0.0
        for tm in tools_materials:
            for descEnt in descEntities:
                if singularize(tm.lower()) == singularize(descEnt.name):
                    score += 1
        score = score / len(tools_materials)
        scoreList.append(score)
    return scoreList
Code example #4
def main():
    for label in list1:
        for g in wn.synsets(label):
            # get every gloss definition
            deftoken = g.definition()
            # print(deftoken)
            # tokenize the definition
            tkn = word_tokenize(deftoken)
            for w in tkn:
                # remove stop words
                if w not in stop_words and len(w) > 1:
                    # b. Convert all plural noun forms (irregular and regular)
                    #    to singular form using the pattern library.
                    token = singularize(w)

                    token = ''.join(ch for ch in token if not ch.isdigit())
                    token = token.lower()
                    token = WordNetLemmatizer().lemmatize(token, 'v')
                    # adjective: pos='a'
                    token = WordNetLemmatizer().lemmatize(token, 'a')
                    # adverb: pos='r'
                    token = WordNetLemmatizer().lemmatize(token, 'r')
                    wordFromList1 = wn.synsets(label)
                    wordFromList2 = wn.synsets(token)
                    if wordFromList1 and wordFromList2:
                        s = wordFromList1[0].path_similarity(wordFromList2[0])
                        if s is not None and s > 0.001:
                            # save all ontologies in a txt file
                            print(label, '~', token, '=', s, '.', file=data)
Code example #5
def wikifilter(keyword):
    wiki_wiki = wikipediaapi.Wikipedia('en')
    candidate = {}
    for key in keyword.keys():
        page_py = wiki_wiki.page(key)
        if page_py.exists():
            candidate[key] = keyword[key]
        else:
            singles = singularize(key)
            page_py = wiki_wiki.page(singles)
            if page_py.exists() == True:
                candidate[singles] = keyword[key]
    #     print(candidate)

    final = {}
    redirect = {}
    relation = {}

    for ca in candidate:
        query = requests.get(
            r'https://en.wikipedia.org/w/api.php?action=query&titles={}&&redirects&format=json'
            .format(ca))
        data = json.loads(query.text)
        PAGES = data["query"]["pages"]
        for v in PAGES.values():
            redirect[ca] = v["title"]
            relation[v["title"]] = ca
            final[v["title"]] = 0

    for ca in redirect.keys():
        final[redirect[ca]] = candidate[ca]
    #     print(final)

    return relation, final
Code example #6
def has_one(model_name, **relation_kwargs):
    """Connect a parent table to a child in a one-to-one relation.

    class Parent:
        child = relationship("Child", uselist=False, back_populates="parent")
    """

    caller_namespace, db, table_name, caller_table_name, caller_class_name = rip_context_info(
        model_name)
    variable_name = table_name

    caller_namespace[f'{variable_name}'] = db.relationship(
        f'{model_name}',
        back_populates=f'{caller_table_name}',
        uselist=False,
        **relation_kwargs,
    )

    try:
        callback = global_relations[caller_table_name][table_name]
    except KeyError:
        callback = None

    if callback:
        back_populates = singularize(table_name)
        callback(back_populates)
Code example #7
File: menu_items.py  Project: tdelam/g1-etl
def _map_categories(category_id, sativa, indica, data, menu_items):
    """
    If the menu item that are % indica and % sativa. If > indica threshold,
    it goes into indica, if > sativa threshold it goes into sativa,
    if neither it goes into hybrid. The other conditions within this will
    map to G1's naming convention, i.e: MMJ Drinks => G1 Drink
    """
    category = data.keys()[data.values().index(category_id)]
    if category.lower() == 'cannabis':
        if sativa > 0 and indica > 0:
            if sativa > 80:
                return 'Sativa'
            if indica > 80:
                return 'Indica'
        else:
            return 'Hybrid'

    if category.lower() == 'paraphernalia':
        return 'Gear'
    if category.lower() == 'tincture':
        return 'Tinctures'
    if category.lower() == 'prerolled':
        return 'Preroll'
    if category in PLURAL_CATEGORIES:
        return singularize(category)
    if category not in CAT_MAP:
        return 'Other'
    return category
Code example #8
File: phpdocument.py  Project: ashiswin/modelgen
    def body(self):
        for c in self.columns:
            if c[0] == 'id':
                continue
            self.document += "\t$" + c[0] + " = $_POST['" + c[0] + "'];\n"
        self.document += "\n"
        self.document += "\t$" + self.connectorName + " = new " + self.connectorName + "($conn);\n\n"
        self.document += "\tif(!$" + self.connectorName + "->create("
        first = True
        for c in self.columns:
            if c[0] == 'id':
                continue

            if not first:
                self.document += ", "

            first = False
            self.document += "$" + c[0]
        self.document += ")) {\n"
        self.document += "\t\t$response['success'] = false;\n"
        self.document += "\t\t$response['message'] = \"Failed to create " + singularize(
            self.table) + "!\";\n"
        self.document += "\t}\n"
        self.document += "\telse {\n"
        self.document += "\t\t$response['success'] = true;\n"
        self.document += "\t}\n\n"
        self.document += "\techo(json_encode($response));\n"
        self.document += "?>"
Code example #9
File: utils.py  Project: kmwenja/ftm
    def stem(self, word):
        token = singularize(word)

        conjugation = conjugate(token, 'inf')
        if conjugation:
            token = str(conjugation)

        return token
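For reference, the two pattern.en calls chained by stem() behave roughly like this on regular words (a hedged sketch; exact outputs depend on pattern's rule tables):

from pattern.en import singularize, conjugate

print(singularize("apples"))        # -> 'apple'
print(conjugate("running", 'inf'))  # -> 'run'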
Code example #10
def get_synset(word, wpos):
    synset = None
    try:
        synset = wordnet.synsets(singularize(lemmatise(word)), wpos)[0]
    except IndexError:
        pass

    return synset
Code example #11
 def scrape_tools(self):
     tools = []
     with open(self.page) as fp:
         line = fp.readline()
         while line:
             tools.append(singularize(line.strip().lower()))
             line = fp.readline()
     return tools
Code example #12
File: phpdocument.py  Project: ashiswin/modelgen
    def __init__(self, t):
        self.connectorName = singularize(t.title()) + "Connector"
        self.table = t

        self.document = "<?php\n"

        self.includes()
        self.body()
Code example #13
File: phpdocument.py  Project: ashiswin/modelgen
 def body(self):
     self.document += "\t$id = $_POST['id'];\n\n"
     self.document += "\t$" + self.connectorName + " = new " + self.connectorName + "($conn);\n\n"
     self.document += "\t$response['" + singularize(
         self.table) + "'] = $" + self.connectorName + "->delete($id);\n"
     self.document += "\t$response['success'] = true;\n\n"
     self.document += "\techo(json_encode($response));\n"
     self.document += "?>"
Code example #14
def wordOrPluralInList(checkList, referenceList, i):
    import pattern
    from pattern.text.en import pluralize
    from pattern.text.en import singularize
    if checkList[i] in referenceList or pluralize(
            checkList[i]) in referenceList or singularize(
                checkList[i]) in referenceList:
        return True
    return False
Code example #15
def parse(pklfile, words, modify_dict):
    good_words = []
    with open(pklfile,'rb') as pkl:
        x = pickle.load(pkl)
    final_transcript = []
    for result in x.results:
        alts = result.alternatives[:5]
        for alt in alts:
            trans = alt.transcript
            raw_words = [x.lower() for x in trans.split(' ') if x]
            good_words.extend([x for x in raw_words if x in words])
            good_words.extend([remove_nonalpha(x) for x in raw_words if remove_nonalpha(x) in words])
            good_words.extend([singularize(x) for x in raw_words if singularize(x) in words])
            good_words.extend([singularize(remove_nonalpha(x)) for x in raw_words if singularize(remove_nonalpha(x)) in words])
            good_words.extend([modify_match(x,modify_dict,1) for x in raw_words if modify_match(x, modify_dict,1) in words])
    good_words = set(good_words)
    return good_words
Code example #16
def similar(text1, text2):
    found = False
    count = 0

    text1 = text1.split(' ')
    text2 = text2.split(' ')

    # ----------------------------------------------
    # Singularizing the words
    text1 = [singularize(word) for word in text1]

    # Singularizing the words
    text2 = [singularize(word) for word in text2]

    # ----------------------------------------------
    # Converting to lower case
    text1 = [word.lower() for word in text1]

    # Converting to lower case
    text2 = [word.lower() for word in text2]

    # ----------------------------------------------
    # Converting into present form
    text1 = [WordNetLemmatizer().lemmatize(word, 'v') for word in text1]

    # Converting into present form
    text2 = [WordNetLemmatizer().lemmatize(word, 'v') for word in text2]


    for word in text1:
        if len(word) > 3:
            if singularize(word) in text2:
                count += 1
            else:
                for syn in wordnet.synsets(word):
                    for name in set(syn.lemma_names()):
                        if name in text2:
                            count += 1
                            found = True
                            break
                    if found:
                        found = False
                        break
    return count
Code example #17
def sort(df, dhlw):
    # Used to store our cleaned subject data
    cleaned_data = pd.DataFrame(columns=['doi', 'subjects', 'title'])
    cleaned_data_filename = 'data/tru_cleaned.csv'
    if dhlw:
        cleaned_data_filename = 'data/dhlw_cleaned.csv'

    blank_subjects = 0  # number that OSTI listed as blank...
    removed_subjects = 0  # number of subjects that were all digits, dots, *, -, and whitespaces
    #p = nltk.PorterStemmer()

    for i, r in df.iterrows():
        subjects_str = r['subjects']
        if not pd.isnull(subjects_str):
            subjects = subjects_str.split(";")

            cleaned_subjects = []
            for s in subjects:
                # first clean by stripping whitespace and lower-casing
                cleaned_s = s.lower().strip()
                # then remove digits, dots, dashes, asterisks and spaces from the start
                cleaned_s = cleaned_s.lstrip('0123456789.-* ')

                if cleaned_s != "":
                    # converts the last word in the subject to its singular form
                    cleaned_s_words = cleaned_s.split(" ")
                    cleaned_s_words[-1] = singularize(cleaned_s_words[-1])
                    cleaned_s = " ".join(cleaned_s_words)

                    subject_counts[cleaned_s] += 1
                    cleaned_subjects.append(cleaned_s)
                else:
                    if s == "":
                        blank_subjects += 1
                    else:
                        removed_subjects += 1

            subjects_str = ';'.join(cleaned_subjects)
        else:
            subjects_str = ""

        cleaned_data = cleaned_data.append(
            pd.DataFrame(
                {
                    'title': r['title'],
                    'doi': r['doi'],
                    'subjects': subjects_str
                },
                index=[0]),
            ignore_index=True)

    cleaned_data.to_csv(cleaned_data_filename, sep='|')

    print("Blank subjects: " + str(blank_subjects))
    print("Removed subjects: " + str(removed_subjects))
Code example #18
def wordOrPluralInListNoLoop(checkList, referenceList):
    import pattern
    from pattern.text.en import pluralize
    from pattern.text.en import singularize
    if checkList in referenceList or pluralize(
            checkList) in referenceList or singularize(
                checkList) in referenceList:
        return True
    else:
        return False
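A hedged usage example for the check above, with a hypothetical reference list (results depend on pattern's inflection rules):

ingredients = ['carrot', 'onions']
print(wordOrPluralInListNoLoop('carrots', ingredients))  # True  ('carrots' -> 'carrot')
print(wordOrPluralInListNoLoop('onion', ingredients))    # True  ('onion' -> 'onions')
print(wordOrPluralInListNoLoop('garlic', ingredients))   # False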
Code example #19
    def scrape_seafood(self):
        seafood = []
        fullHTML = self.soup.find_all('li')
        for x in range(11, 98):
            seafood.append(
                singularize(
                    re.sub(r'\(.*?\)', '', fullHTML[x].text.strip().encode(
                        'utf-8').lower()).strip()))

        return seafood
Code example #20
def preText(text, pos_bow, neg_bow):
    # parameter:
    # text : takes a sentence, string
    # pos_bow: positive bag of words, list
    # neg_bow: negative bag of words, list

    # return:
    # score_word_sim : similarity score for all verbs, float
    # score_bow: score for bag of words implementation, float

    # recognize verb pattern
    pattern = [{
        "POS": "VERB",
        "OP": "*"
    }, {
        "POS": "ADV",
        "OP": "*"
    }, {
        "POS": "VERB",
        "OP": "+"
    }, {
        "POS": "PART",
        "OP": "*"
    }]

    # extract verb pattern
    doc = textacy.make_spacy_doc(text, lang='en_core_web_lg')
    verbs = textacy.extract.matches(doc, pattern)
    score_word_sim = 0.0
    score_bow = 0.0
    for verb in verbs:
        # singularize verb, e.g. "likes" to "like"
        singularized_verb = singularize(verb.text)
        score_word_sim += wordSimilarity(pos_bow, neg_bow, singularized_verb)
        # apply bag of words to the singularized verb
        score_bow += pos_bow.count(str(singularized_verb))
        score_bow -= neg_bow.count(str(singularized_verb))

    # aggregate all verb similarity
    if score_word_sim > 0.5:
        score_word_sim = 1.0
    elif score_word_sim < -0.5:
        score_word_sim = -1.0
    else:
        score_word_sim = 0.0

    # aggregate the count with bag of words
    if score_bow > 0.5:
        score_bow = 1.0
    elif score_bow < -0.5:
        score_bow = -1.0
    else:
        score_bow = 0.0

    return score_word_sim, score_bow
Code example #21
    def Monitor_DeleteNonEnglishWords(sentence):
        # We will clean the sentence from non-english words
        nltk.download('words')
        words = set(nltk.corpus.words.words())
        sentence = " ".join(
            w for w in nltk.wordpunct_tokenize(sentence)
            if singularize(w).lower() in words or not w.isalpha())

        if sentence[-1] != '.':
            sentence += '.'
        return sentence
Code example #22
 def scrape_meats(self):
     meats = []
     article = self.soup.find_all('article',
                                  {"class": "tag-animal-protein-list"})
     fullHTML = []
     for art in article:
         fullHTML = art.find_all("li")
     for x in range(0, len(fullHTML)):
         meats.append(fullHTML[x].text.strip().encode('utf-8').lower())
     self.meats = meats
     meats = [singularize(m.strip()) for m in meats]
     return meats
Code example #23
def normalize(string):
    # words that should not be passed through singularize(): plural-only nouns plus
    # words ending in -s/-ss that singularize would otherwise mangle
    plurale_tantum = ['this', 'yes', 'pants', 'shorts', 'glasses', 'scissors', 'panties', 'trousers', 'binoculars', 'pliers', 'tongs',
        'tweezers', 'forceps', 'goggles', 'jeans', 'tights', 'leggings', 'chaps', 'boxers', 'indoors', 'outdoors', 'bus', 'octopus', 'waitress',
        'pasta', 'pita', 'glass', 'asparagus', 'hummus', 'dress', 'cafeteria', 'grass', 'class']

    irregulars = {'shelves': 'shelf', 'bookshelves': 'bookshelf', 'olives': 'olive', 'brownies': 'brownie', 'cookies': 'cookie'}
    
    temp = string.strip().lower()
    if temp in irregulars:
        return irregulars[temp]
    
    return temp if temp.split(' ')[-1] in plurale_tantum or temp[-2:] == 'ss' else singularize(temp)
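A few hedged examples of how normalize() above behaves (the last case depends on pattern's singularize rules):

print(normalize(' Shelves '))  # -> 'shelf'    (irregular map)
print(normalize('glasses'))    # -> 'glasses'  (in the do-not-singularize list)
print(normalize('apples'))     # -> 'apple'    (falls through to singularize)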
Code example #24
def get_synset(phrase):
    synset = None
    for word, pos in tag(phrase):
        if pos.startswith('N') and word != 'of':
            try:
                synset = wordnet.synsets(singularize(lemmatise(word)))[0]
            except IndexError:
                try:
                    synset = wordnet.synsets(lemmatise(word))[0]
                except IndexError:
                    try:
                        synset = wordnet.synsets(singularize(word))[0]
                    except IndexError:
                        try:
                            synset = wordnet.synsets(word)[0]
                        except IndexError:
                            logging.error("Failed to find synset for '" + word + "'")
                            continue
        elif pos == 'PRP':
            return wordnet.synsets('living thing')[0]

    return synset
Code example #25
def processIngredients(ingredient):
    results = []
    parsed = [singularize(e) for e in sent_parse(ingredient)]
    for element in parsed:
        # exact match
        if element in foodList:
            results += [element]
        # split current word and exact match each sub-word
        else:
            for w in element.split(' '):
                if w in foodList:
                    results += [w]
    return list(set(results))
Code example #26
def wikifilter(keyword):
    wiki_wiki = wikipediaapi.Wikipedia('en')
    candidate = {}
    final = {}
    redirect = {}
    relation = {}

    for key in keyword.keys():
        page_py = wiki_wiki.page(key)
        if page_py.exists():
            query = requests.get(
                r'https://en.wikipedia.org/w/api.php?action=query&titles={}&&redirects&format=json'
                .format(key))
            data = json.loads(query.text)
            PAGES = data["query"]["pages"]
            #print(PAGES)
            for v in PAGES.values():
                redirect[key] = v["title"]
                #print(redirect)
                temp_list = relation.get(v["title"], [])
                temp_list.append(key)
                relation[v["title"]] = temp_list
                #print(relation)
                final[v["title"]] = 0

        else:
            singles = singularize(key)
            page_py = wiki_wiki.page(singles)
            if page_py.exists():
                query = requests.get(
                    r'https://en.wikipedia.org/w/api.php?action=query&titles={}&&redirects&format=json'
                    .format(singles))
                data = json.loads(query.text)
                PAGES = data["query"]["pages"]
                #print(PAGES)
                for v in PAGES.values():
                    redirect[key] = v["title"]
                    #print(redirect)
                    temp_list = relation.get(v["title"], [])
                    temp_list.append(key)
                    relation[v["title"]] = temp_list
                    # print(relation)
                    final[v["title"]] = 0

    for k in redirect.keys():
        final[redirect[k]] = final[redirect[k]] + keyword[k]

    #     print(final)

    return relation, final
Code example #27
def convert_to_ml(unit, conversion):
    """
    Convert the unit, usually in volume, 
    to a weight so we can calculate the fraction of ingredient present/
    """
    unit = singularize(unit.lower())

    qty_ml = 0
    try:
        qty_ml = conversion[unit]
    except KeyError:
        qty_ml = 1

    return qty_ml
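A hedged usage sketch for convert_to_ml(); the conversion table here is hypothetical and would normally be supplied by the caller:

conversion = {'cup': 240, 'tablespoon': 15, 'teaspoon': 5}
print(convert_to_ml('Cups', conversion))   # -> 240  ('Cups' singularizes to 'cup')
print(convert_to_ml('pinch', conversion))  # -> 1    (unknown unit falls back to 1)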
Code example #28
File: common.py  Project: RameezAijaz10P/tmpRepo
def clean_words(description):
    words = re.split("[/ ]+", str(description))
    keywords = []
    for word in words:
        word = word.lower()
        if word not in stop_words:
            word = re.sub('[^A-Za-z]+', '', word)
            if word != '':
                word = WordNetLemmatizer().lemmatize(word, 'v')
                word = singularize(word)
                word = spell(word)
                keywords.append(str(word))

    return keywords
Code example #29
    def _validate(self, clue: str, positiveWords: np.array,
                  negativeWords: np.array) -> bool:
        clue = clue.lower()

        invalidWords: np.array = np.append(
            self.previousClues, np.append(positiveWords, negativeWords))
        stemmedClue: str = stem(clue)
        singularClue: str = singularize(clue)
        pluralClue: str = pluralize(clue)

        if not clue.isalpha() or not clue.isascii() or set(
                "aeiouy").isdisjoint(clue) or not 2 <= len(clue) <= 12:
            return False

        for word in invalidWords:
            stemmedWord = stem(word)
            singularWord = singularize(word)
            pluralWord = pluralize(word)
            if clue in word or word in clue or stemmedClue in word or stemmedWord in clue or \
                    singularClue in word or singularWord in clue or pluralClue in word or pluralWord in clue:
                return False

        return True
Code example #30
File: common.py  Project: rarmitag/namex
def get_plural_singular_name(name):
    d = {}
    for word in name.split():
        val = []
        singular = singularize(word.lower())
        val.extend([singular])
        val.extend([word.lower()])
        d[word] = (list(set(val)))

    name_list = []
    for combination in product(*d.values()):
        name_list.append(' '.join(combination))

    return name_list
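A hedged example of the combinations get_plural_singular_name() produces (ordering varies because the per-word variants go through set()):

print(get_plural_singular_name("Wine Stores"))
# -> e.g. ['wine stores', 'wine store']
#    each word contributes its lower-cased original and its singular form,
#    and itertools.product() enumerates every combination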
Code example #31
def convert_postag(complex_word, candidates):
    specific_tag = NLP.pos_tag(complex_word)[0][1]
    generic_tag = get_type(specific_tag)
    # print(generic_tag)
    final_candidates = set()
    if generic_tag == "NN":  ### Nouns
        # print(generic_tag)
        for candidate in candidates:
            candidate_tag = NLP.pos_tag(candidate)[0][1]
            if specific_tag == "NNS" and candidate_tag != "NNS":
                candidate = pluralize(candidate)
                # print("pluraaal  ", candidate)
            elif specific_tag == "NN" and candidate_tag == "NNS":
                candidate = singularize(candidate)
                # print("singulaaar" , candidate)
            # print("wwilll add")
            final_candidates.add(candidate)
    elif generic_tag == "ADJ":  ## Adjectives
        for candidate in candidates:
            candidate_tag = NLP.pos_tag(candidate)[0][1]
            if specific_tag == "JJR" and candidate_tag != "JJR":
                candidate = comparative(candidate)
                # print(candidate , "jjr")
            elif specific_tag == "JJS" and candidate_tag != "JJS":
                # print(candidate , "jjs")
                candidate = superlative(candidate)
            # print(candidate , "added")
            final_candidates.add(candidate)
    elif generic_tag == "VB":  ## Verbs
        complex_tenses = tenses(complex_word)
        if len(complex_tenses) < 1:
            return candidates
        # compare against the tense string (e.g. 'past'), not the whole tenses() list
        complex_tense = complex_tenses[0][0]

        for candidate in candidates:
            # print("my tense", complex_tense, " candidate ", candidate, " ", tenses(candidate)[0][0])
            if len(tenses(candidate)) > 0 and tenses(candidate)[0][0] != complex_tense:
                if complex_tense == "past":
                    candidate = conjugate(candidate, tense=PAST)
                elif complex_tense == "present":
                    candidate = conjugate(candidate, tense=PRESENT)
                elif complex_tense == "future":
                    candidate = conjugate(candidate, tense=FUTURE)
                elif complex_tense == "infinitive":
                    candidate = conjugate(candidate, tense=INFINITIVE)
            final_candidates.add(candidate)
    else:
        final_candidates = candidates

    return final_candidates
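The example relies on several pattern.en inflection helpers; a hedged sketch of what they return for regular words (exact output depends on pattern's rule tables):

from pattern.en import (pluralize, singularize, comparative, superlative,
                        conjugate, tenses, PAST)

print(pluralize('candidate'))         # -> 'candidates'
print(singularize('candidates'))      # -> 'candidate'
print(comparative('small'))           # -> 'smaller'
print(superlative('small'))           # -> 'smallest'
print(conjugate('give', tense=PAST))  # -> 'gave'
print(tenses('gave')[0][0])           # -> 'past'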
Code example #32
def prepare_text(input):
    sentences = nltk.sent_tokenize(input)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    for i in range(len(sentences)):
        for j in range(len(sentences[i])):
            sentences[i][j] = de_nonletter.sub('', sentences[i][j].lower())

    sentences = [deleteEmpty(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    for sentence in sentences:
        for i in range(len(sentence)):
            if sentence[i][1] == 'NNS':
                tlst = list(sentence[i])
                tlst[0] = singularize(tlst[0])
                tlst[1] = 'NN'
                sentence[i] = tuple(tlst)
    sentences = [NPChunker.parse(sent) for sent in sentences]
    return sentences
Code example #33
def get_best_unit(ingred_list, conversion):
    """
    Some ingredients, have different options like
      1 pkg (7.5 oz) 
    the code will identify both, this function tells which one to use
    """
    for ingred in ingred_list:
        # print(ingred)
        inspect = ingred.strip().split(' ')
        # print(inspect[1].lower())

        new_unit = singularize(inspect[1].lower())

        if new_unit in list(conversion.keys()):
            return ingred

    # could not find it in the list use the first one
    return ingred_list[0]
Code example #34
def inject_nn_error(example):
    tok_sentence, features_seq = example[0].split(), example[1]
    noun_indices = []
    for i in range(len(features_seq)):
        if features_seq[i]['universal_postag'] == 'NOUN':
            noun_indices.append(i)

    if len(noun_indices) > 0:
        j = random.choice(noun_indices)
        singular_form = pattern.singularize(tok_sentence[j])
        plural_form = pattern.pluralize(tok_sentence[j])
        if tok_sentence[j] != singular_form:
            tok_sentence[j] = singular_form
        elif tok_sentence[j] != plural_form:
            tok_sentence[j] = plural_form
    else:
        return None
    return [' '.join(tok_sentence), features_seq]
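A hedged usage sketch for inject_nn_error(); the feature sequence here is hypothetical and only carries the 'universal_postag' key the function reads:

example = ("the cats sat on the mat",
           [{'universal_postag': t} for t in
            ['DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN']])
print(inject_nn_error(example))
# -> e.g. ['the cat sat on the mat', <features_seq>]
#    one randomly chosen noun is flipped between singular and plural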
Code example #35
def create_characters(dependencies):
    characters = []
    for dependency in dependencies:
        cpostag = dependency['CPOSTAG']
        if not (cpostag.startswith('N') or cpostag.startswith('PR')):
            dependency['CHARACTER_ID'] = ''
            continue

        global NEXT_CHARACTER_ID
        dependency['CHARACTER_ID'] = str(NEXT_CHARACTER_ID)
        NEXT_CHARACTER_ID += 1

        form = dependency['FORM']
        words = form.split(' ')
        gender = ''
        object_state = ''

        if dependency['CPOSTAG'].startswith('P'):
            if set(words) & PLURAL_PRONOUNS:
                num = 'pl'
            else:
                num = 'sg'

            if set(words) & MALE_PRONOUNS:
                gender = 'm'
            elif set(words) & FEMALE_PRONOUNS:
                gender = 'f'
            elif set(words) & NEUTRAL_PRONOUNS:
                gender = 'n'

            if not gender == 'n':
                object_state = 'a'

        else:
            if dependency['POSTAG'].endswith('S'):
                num = 'pl'
            else:
                num = 'sg'

            try:
                synset = wordnet.synsets(singularize(lemmatise(words[-1])))[0]
            except IndexError:
                try:
                    synset = wordnet.synsets(lemmatise(words[-1]))[0]
                except IndexError:
                    try:
                        synset = wordnet.synsets(singularize(words[-1]))[0]
                    except IndexError:
                        try:
                            synset = wordnet.synsets(words[-1])[0]
                        except IndexError:
                            logging.error("Failed to find synset for '" + words[-1] + "'")
                            continue

            hyps = set()
            for h in synset.hypernyms(recursive=True):
                try:
                    hyps.add(h.gloss)
                except ValueError:
                    continue

            hyps.add(synset.gloss)

            object_state = determine_object_state(hyps)
            gender = determine_gender(hyps, object_state)

        character = Character(dependency['ID'], num, gender, object_state)
        character.text = form
        if dependency['POSTAG'].startswith('P') and not set(words) & NON_RESOLUTION_PRONOUNS:
            character.is_pronoun = True
        character.add_relation("IsA", form)
        characters.append(character)

    return characters