Example #1
def correct_case(value: str) -> str:
    """Correct the case in uppercase only text.

    Normalize all uppercase text, changing it to lower case but capitalizing
    correctly the start of sentences, proper nouns, etc.

    """
    if value.upper() == value:
        return truecase.get_true_case(value)
    elif value.lower() == value:
        return truecase.get_true_case(value)
    return value
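For reference, a minimal usage sketch of the underlying call (assuming the truecase package and its bundled model data are installed; the exact output depends on that model):

import truecase

print(truecase.get_true_case("I LIVE IN NEW YORK."))  # e.g. "I live in New York."
print(truecase.get_true_case("i live in new york."))  # e.g. "I live in New York."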
Example #2
def extract_pos_features(segments):
    """
    :param segments: List of text segments. Each segment is a string.
    :return: feats_dict: Dictionary mapping feature name to value for transcript
    Note:  POS is more accurate for segments that contain capitalization, so if text is fully lowercase
           this function will try to infer the true casing before POS detection.
    """
    # initialize feature dictionary with POS types
    feats_dict = dict((key, 0) for key in POS_KEY_LIST)
    num_words = 0
    # add POS count features
    for segment in segments:
        lowercase = segment.islower()
        # split up into words
        segment = segment.split(" ")
        num_words += len(segment)
        if lowercase:
            # if text has been lowercased,
            # transform to true case (i.e. capitalize if supposed to be), so that POS tagger works better
            segment_str = " ".join(segment)
            truecase_str = truecase.get_true_case(segment_str)
            segment = truecase_str.split(" ")
        if '' in set(segment):
            segment[:] = [w for w in segment if w != '']
        pos_seg = nltk.pos_tag(segment)
        for word, tag in pos_seg:
            update_feature_vals(tag, feats_dict)
    get_pos_ratios(feats_dict)
    # convert counts to proportions
    for key in POS_KEY_LIST:
        count = float(feats_dict[key])
        feats_dict[key] = count / float(num_words)
    return feats_dict
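To illustrate why the truecasing step helps the tagger (a small sketch, assuming nltk with the averaged_perceptron_tagger data and the truecase package are installed):

import nltk
import truecase

segment = "barack obama visited paris"
print(nltk.pos_tag(segment.split()))                          # proper nouns are often mis-tagged when lowercased
print(nltk.pos_tag(truecase.get_true_case(segment).split()))  # the restored casing usually yields NNP tags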
Example #3
 def _normalize_text(self, text: str) -> str:
     normalized_text = truecase.get_true_case(text)
     # Fix some punctuation issues, e.g. `roughly$ 19` becomes `roughly $19`
     normalized_text = re.sub(
         r"([#$(<[{]) ", lambda x: f" {x.group(1)}", normalized_text
     )
     return normalized_text
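The punctuation fix in isolation (same pattern as above):

import re

s = "roughly$ 19 million"
print(re.sub(r"([#$(<[{]) ", lambda x: f" {x.group(1)}", s))  # -> "roughly $19 million"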
Example #4
    def extract_keywords(self, title: str, graf: str):
        keywords = set([])
        location = None

        sents = [title] + sent_tokenize(graf)
        for sent in sents:
            doc = self.nlp(truecase.get_true_case(sent))
            for ent in doc.ents:
                if ent.label_ == "GPE" and not location:
                    location = ent.text
                phrase = self.remove_stopwords(ent.text.lower())
                if phrase:
                    keywords.add(phrase)
            for chunk in doc.noun_chunks:
                phrase = self.remove_stopwords(chunk.text.lower())
                if phrase:
                    keywords.add(phrase)

            if len(keywords) >= 10:
                break

        return keywords, location


# test
# extractor = KeywordExtractor()
# print(extractor("The state attorneys general in more than a dozen states are preparing to begin an antitrust investigation of the tech giants, according to two people briefed on the discussions, increasing pressure on the companies. The social media companies removed accounts and said they were sowing divisive messages about the Hong Kong protests."))
Example #5
    def postprocess_text(self, text):
        """Postprocesses and prepares the output summarized text"""

        # Restore proper casing (sentence-level first, then with the truecaser)
        text = self.truecasing_by_sentence_segmentation(text)
        text = truecase.get_true_case(text)
        text = self.fix_summary(text)
        return text
Example #6
def correct_output(in_file, out_file):
    PUNCTUATION_MAPPING = {
        " !EXCLAMATIONMARK": "!",
        " .PERIOD": ".",
        " :COLON": ":",
        " ,COMMA": ",",
        " ;SEMICOLON": ";",
        " -DASH": "-",
        " ?QUESTIONMARK": "?",
    }
    with open(in_file, 'r') as in_f:
        contents = in_f.read()
    # restore real punctuation marks, then let the truecaser fix the casing
    for token, mark in PUNCTUATION_MAPPING.items():
        contents = contents.replace(token, mark)
    contents = truecase.get_true_case(contents)
    with open(out_file, 'w') as out_f:
        out_f.write(contents)
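A small in-memory sketch of the same two steps, token replacement followed by truecasing (assumes the truecase package; the restored casing is model-dependent):

import truecase

PUNCTUATION_MAPPING = {" ,COMMA": ",", " .PERIOD": "."}
s = "hello ,COMMA how is new york .PERIOD"
for token, mark in PUNCTUATION_MAPPING.items():
    s = s.replace(token, mark)
print(truecase.get_true_case(s))  # e.g. "Hello, how is New York."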
Example #7
def calculateParameters(doc: str,
                        scores: Dict[str, float],
                        cands,
                        pr: Dict[str, float] = None):
    params = []

    max_cand_score = max(scores.values())
    all_cands = cands
    for cand in all_cands:

        freq = doc.count(cand)

        # pagerank_score = pr[cand]

        if cand not in scores:
            cand_score = 0.
        else:
            cand_score = scores[cand] / max_cand_score

        cand_len = len(cand)
        cand_term_count = len(cand.split())

        first_match = doc.find(cand) / len(doc)
        last_match = doc.rfind(cand) / len(doc)
        ne_cand = get_true_case(cand)
        words = nltk.pos_tag(nltk.word_tokenize(ne_cand))
        ne = nltk.tree2conlltags(nltk.ne_chunk(words))
        ne = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
            if key
        ]

        ne_cnt = len(ne[0].split()) if ne else 0

        if first_match == last_match:
            spread = 0.
        else:
            spread = last_match - first_match

        params.append([
            cand_score, cand_len, cand_term_count, first_match, 1 - last_match,
            ne_cnt
        ])  #, pagerank_score])  # , r[cand]])

    params = np.array(params)
    max_ = params.max(axis=0)
    params = np.divide(params,
                       max_,
                       out=np.zeros_like(params),
                       where=max_ != 0)
    return dict(zip(all_cands, params))
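The final normalization is a column-wise max scaling that avoids division by zero; a standalone illustration of the same np.divide pattern:

import numpy as np

params = np.array([[2.0, 10.0, 0.0],
                   [1.0,  0.0, 0.0]])
col_max = params.max(axis=0)                  # [2.0, 10.0, 0.0]
normalized = np.divide(params, col_max,
                       out=np.zeros_like(params),
                       where=col_max != 0)    # all-zero columns stay zero instead of NaN/inf
print(normalized)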
Example #8
def get_url2description_and_vectors(url):
    lock.acquire()
    global count
    count += 1
    caption = url2caption[url]
    lock.release()
    case_caption = truecase.get_true_case(caption)
    case_caption = preprocessing(case_caption)
    url2keywords, url2keywords_vector = FindKeywords(case_caption)
    lock.acquire()
    url2description[url] = url2keywords
    url2description_vector[url] = url2keywords_vector
    lock.release()
Example #9
def truecase_sentence(tokens):
    previous_len = len(tokens)
    word_lst = [(w, idx) for idx, w in enumerate(tokens)
                if all(c.isalpha() for c in w)]
    lst = [w for w, _ in word_lst if re.match(r'\b[A-Z]+\b', w)]
    if len(lst) and len(lst) == len(word_lst):
        parts = truecase.get_true_case(' '.join(lst)).split()
        if len(parts) != len(word_lst):
            return tokens
        for (w, idx), nw in zip(word_lst, parts):
            tokens[idx] = nw
    assert len(tokens) == previous_len
    return tokens
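A usage sketch for the token-level helper above (requires `import re` and `import truecase`; the restored casing depends on the truecase model, so the output is only indicative):

tokens = ["OBAMA", "VISITED", "PARIS", "IN", "2009"]
print(truecase_sentence(tokens))  # e.g. ['Obama', 'visited', 'Paris', 'in', '2009']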
Example #10
    def postprocess(cls, text: str) -> str:
        """Post-processes the text.

        Args:
            text (:obj:`str`):
                The text to be post-processed.

        Returns:
            :obj:`str`: The post-processed text.
        """

        txt = ' '.join(sentence_tokenize(text))
        return get_true_case(txt)
Example #11
    def create_data(self, sentence_4):
        query = self.get_query(sentence_4)
        entities_replacements = self.get_entities_replacements(sentence_4)
        sentence_3 = truecase.get_true_case(
            self.get_sentence_3(sentence_4, entities_replacements))
        sentence_2 = truecase.get_true_case(
            self.get_sentence_2(sentence_3, entities_replacements))
        sentence_1 = truecase.get_true_case(entities_replacements[0][2])

        json_data = {
            'query': query,
            'sentence_4': sentence_4,
            'sentence_3': sentence_3,
            'sentence_2': sentence_2,
            'sentence_1': sentence_1
        }
        json_data = json.dumps(json_data, indent=4, sort_keys=False)
        print(json_data)

        self.sia_dataset_reader.write_dataset(query, sentence_4, sentence_3,
                                              sentence_2, sentence_1)

        return (query, sentence_4, sentence_3, sentence_2, sentence_1)
Example #12
    def dominant_entities_text(stanford_client: StfNERTagger, client_google,
                               comment: str) -> tuple:
        """
        Analyze the text and return the text entities and type entities
        :param stanford_client: connection to stanford client
        :param client_google: connection to google cloud
        :param comment: text to analyze
        :return: tuple with entities text and entitiies type
        """
        if len(comment) == 0:
            return [], [], []

        stf_person_types = stanford_client.identify_person_types(
            text=truecase.get_true_case(comment))

        document = ClientsLanguageSentiment.convert_type_google_document(
            comment)
        encoding = enums.EncodingType.UTF32
        entities_google_text = []
        entities_google_salience = []
        entities_google_type = []

        try:
            entities_google = client_google.analyze_entities(
                document, encoding).entities
            for entity in entities_google:
                entity_type = enums.Entity.Type(entity.type)

                # Discard very short or very long entity names, which tend to be
                # noisy detections from Google and lead to overly dense sentences.
                if len(str(entity.name)) <= 3 or len(str(entity.name)) >= 25:
                    continue

                # Validate Type.PERSON detections from GCloud using Stanford as the baseline;
                # many person names are not detected correctly by GCP but are by Stanford.
                if (entity_type == enums.Entity.Type.PERSON and entity.name.lower() in stf_person_types) or (
                        entity_type != enums.Entity.Type.PERSON and entity.name.lower() in stf_person_types) \
                        or entity_type in [enums.Entity.Type.NUMBER, enums.Entity.Type.PHONE_NUMBER,
                                           enums.Entity.Type.ADDRESS, enums.Entity.Type.PRICE, enums.Entity.Type.DATE]:
                    continue

                entities_google_text.append(entity.name)
                entities_google_salience.append(entity.salience)
                entities_google_type.append(entity_type.name)
            entities_google_text = list(entities_google_text)
            entities_google_salience = list(map(str, entities_google_salience))
            entities_google_type = list(set(entities_google_type))
        except Exception as e:
            logger.error(msg=str(e))
            return [], [], []
        return entities_google_text, entities_google_salience, entities_google_type
Example #13
    def truecase_sentence(self, tokens):
        word_lst = [(w, idx) for idx, w in enumerate(tokens)
                    if all(c.isalpha() for c in w)]
        lst = [w for w, _ in word_lst if re.match(r'\b[A-Z\.\-]+\b', w)]
        if len(lst) and len(lst) == len(word_lst):
            parts = truecase.get_true_case(' '.join(lst)).split()

            # the truecaser has its own tokenization ...
            # skip if the number of words doesn't match
            if len(parts) != len(word_lst): return tokens

            for (w, idx), nw in zip(word_lst, parts):
                tokens[idx] = nw
        return tokens
Example #14
def calculateParameters(all_cands, doc, scores):
    params = []

    max_cand_score = max(scores.values())

    for cand in all_cands:

        freq = doc.count(cand)

        if cand not in scores:
            cand_score = 0.
        else:
            cand_score = scores[cand]  # / max_cand_score

        cand_len = len(cand)
        cand_term_count = len(cand.split())
        ne_cand = get_true_case(cand)
        words = nltk.pos_tag(nltk.word_tokenize(ne_cand))
        ne = nltk.tree2conlltags(nltk.ne_chunk(words))
        ne = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
            if key
        ]

        ne_cnt = len(ne[0].split()) if ne else 0

        first_match = doc.find(cand) / len(doc)
        last_match = doc.rfind(cand) / len(doc)

        # if cand_term_count == 1:
        #     cohesion = 0.
        # else:
        #     cohesion = cand_term_count * (1 + math.log(freq, 10)) * freq /

        if first_match == last_match:
            spread = 0.
        else:
            spread = last_match - first_match

        # print([cand_score, freq, cand_len, cand_term_count, first_match, last_match, spread, ne_cnt])

        params.append([
            cand_score, cand_len, cand_term_count, first_match, last_match,
            spread, ne_cnt
        ])  #cand_score,
    return params
Example #15
def truecase_sentence(tokens):
    """
    from https://github.com/ghaddarAbs
    for experimenting with CoNLL-2003 casing
    """
    word_lst = [(w, idx) for idx, w in enumerate(tokens) if all(c.isalpha() for c in w)]
    lst = [w for w, _ in word_lst if re.match(r'\b[A-Z\.\-]+\b', w)]

    if len(lst) and len(lst) == len(word_lst):
        parts = truecase.get_true_case(' '.join(lst)).split()

        # the truecaser has its own tokenization ...
        # skip if the number of words doesn't match
        if len(parts) != len(word_lst): return tokens

        for (w, idx), nw in zip(word_lst, parts):
            tokens[idx] = nw
    return tokens
Example #16
def get_entities(DocList):
    ListDoc = [Preprocessing(truecase.get_true_case(sent)) for sent in DocList]
    # Reference: https://www.geeksforgeeks.org/python-named-entity-recognition-ner-using-spacy/

    black_list = ['CARDINAL', 'ORDINAL', 'DATE', 'ORG']

    nlp = spacy.load('en_core_web_sm')
    Entities = [[
        ent.text.lower() for ent in nlp(doc).ents
        if ent.label_ not in black_list and len(ent.text) > 2
    ] for doc in ListDoc]
    #Entities = [[(ent.text, ent.label_) for ent in nlp(doc).ents if ent.label_ not in black_list and len(ent.text)>2] for doc in ListDoc]
    Entities = list(
        set([
            ' '.join(List) for List in remove_stopwords(Entities)
            if len(List) > 0
        ]))

    return Entities
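A trimmed, hypothetical sketch of the same pipeline, truecasing followed by spaCy NER with the label blacklist (assumes en_core_web_sm is installed; the Preprocessing and remove_stopwords helpers are omitted):

import spacy
import truecase

nlp = spacy.load('en_core_web_sm')
black_list = ['CARDINAL', 'ORDINAL', 'DATE', 'ORG']
doc = nlp(truecase.get_true_case("barack obama gave a speech in berlin last tuesday"))
print([ent.text.lower() for ent in doc.ents
       if ent.label_ not in black_list and len(ent.text) > 2])
# e.g. ['barack obama', 'berlin']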
Example #17
def extract_graph_feats(segments):
    """
    :param segments: List of text segments. Each segment is a string.
    :return: feats_dict: Dictionary mapping feature names to values
    """
    feats_dict = {}
    # if segments are all lowercase, try to infer true case (this helps POS detection)
    for idx, seg in enumerate(segments):
        if seg.islower():
            segments[idx] = truecase.get_true_case(seg)
    # break segments up into words
    #segments_mixed_case = [s.split(" ") for s in segments]
    segments_mixed_case = []
    for s in segments: 
        text = s.split(" ") 
        if '' in set(text):
            text[:] = [w for w in text if w != '']
        segments_mixed_case.append(text) 
    
    # also get lowercase version (used for naive graph)
    #segments_lower_case = [s.lower().split() for s in segments]
    segments_lower_case = []
    for s in segments: 
        text = s.lower().split() 
        if '' in set(text):
            text[:] = [w for w in text if w != '']
        segments_lower_case.append(text) 
    
    # build graphs
    naive_graph = create_naive_graph(segments_lower_case)
    lemma_graph = create_lemma_graph(segments_mixed_case)  # POS detection is used to help with lemmatization
    pos_graph = create_pos_graph(segments_mixed_case)
    # compute features for each graph
    get_graph_metrics(naive_graph, 'naive', feats_dict)
    get_graph_metrics(lemma_graph, 'lemma', feats_dict)
    get_graph_metrics(pos_graph, 'pos', feats_dict)
    # add normalized versions of features
    word_count = get_word_count(segments)
    add_norm_feats(feats_dict, word_count)
    return feats_dict
Example #18
def get_first_city(message: str) -> str:
    '''
    Returns the first city found in the message.

    Arguments:
    - message: the message provided

    Returns:
    - a city name if found, else an empty string
    '''

    corrected = correct(message)
    true_case = truecase.get_true_case(corrected)
    doc = nlp(true_case)

    for ent in doc.ents:
        if ent.label_ in ['GPE', 'PERSON']:
            city = gc.get_cities_by_name(ent.text)
            if city:
                return ent.text

    return ''
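A hypothetical end-to-end sketch of the same idea, treating `nlp` as a spaCy pipeline and `gc` as a geonamescache.GeonamesCache instance (both are assumptions based on the snippet above; the spelling-correction step is omitted):

import spacy
import truecase
import geonamescache

nlp = spacy.load('en_core_web_sm')
gc = geonamescache.GeonamesCache()

message = truecase.get_true_case("book me a flight to paris tomorrow")
for ent in nlp(message).ents:
    if ent.label_ in ['GPE', 'PERSON'] and gc.get_cities_by_name(ent.text):
        print(ent.text)  # e.g. "Paris"
        break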
Example #19
def to_truecase(tokens):
    """
    # code from https://github.com/google-research/bert/issues/223#issuecomment-649619302

    # original tokens
    #['FULL', 'FEES', '1.875', 'REOFFER', '99.32', 'SPREAD', '+20', 'BP']

    # truecased tokens
    #['Full', 'fees', '1.875', 'Reoffer', '99.32', 'spread', '+20', 'BP']
    """
    word_lst = [(w, idx) for idx, w in enumerate(tokens) if all(c.isalpha() for c in w)]
    lst = [w for w, _ in word_lst if re.match(r'\b[A-Z\.\-]+\b', w)]

    if len(lst) and len(lst) == len(word_lst):
        parts = truecase.get_true_case(' '.join(lst)).split()

        # the truecaser has its own tokenization ...
        # skip if the number of words doesn't match
        if len(parts) != len(word_lst): return tokens

        for (w, idx), nw in zip(word_lst, parts):
            tokens[idx] = nw
    return tokens
Example #20
def get_Captions_Entities():
    Captions = {}
    nlp = spacy.load('en_core_web_sm')

    with open('Captions/Mira_ToBeLabeled.csv',
              newline='',
              encoding='ISO-8859-1') as f:  # encoding='ISO-8859-1'
        reader = csv.reader(f)
        i = 0
        for row in reader:
            if i == 0:
                i = 1
            else:
                #doc = Preprocessing(truecase.get_true_case(row[3]))
                doc = truecase.get_true_case(row[3])
                Entities = [ent.text.lower() for ent in nlp(doc).ents]
                Entities = list(
                    set([
                        ' '.join(List) for List in remove_stopwords(Entities)
                        if len(List) > 0
                    ]))
                Captions[row[2]] = Entities
    return Captions
Example #21
def create_doc(text):
    if TRUE_CASE:
        content = truecase.get_true_case(text).replace('\n', ' ')
    else:
        content = text.replace('\n', ' ')
    return nlp(content)
Example #22
 def normalizeText_duc05(self, multi_doc):
     # List of hyphenated words whose hyphens should be replaced with spaces in titles
     not_hyphenated = [
         'COLOMBIA-L.A.', 'MIAMI-TO-L.A.', 'Turkey-Syria', 'VW-Opel', 'GM-VW', 'O.C.-L.A.',
         'Ever-more-complex', 'ARGENTINE-BRITISH', '46-MILLION', 'UK-Argentine', 'PRO-MARIJUANA',
         'test-case', '2.46-MILLION', 'plasma-based', 'JOB-DEATH', 'GOLD-MINING', '1.3-MILLION',
         '150-MILLION-A-YEAR', 'Atom-smasher', 'LITHIUM-WATER', 'NERVE-GAS', 'drug-makers',
         'EAST-WEST', '2-MILLION-SQUARE-MILE', 'Hume-Adams', 'BEAR-POACHERS', 'Robot-selected',
         'self-rule', 'Ulster-style', '71-MILLION', '1.5-MILLION', '12-MILLION', '7-MILLION',
         'Iran-U.S.'
     ]
     type_of_titles = ['HEAD', 'HL', 'HEADLINE', 'H3']
     task_name = splitext(basename(self.filesList[0]))[0]
     if not multi_doc:
         while True:
             try:
                 tree = ET.parse(self.filesList[0])
             except (FileNotFoundError, PermissionError):
                 self.filesList = (input(
                     "The path doesn\'t contain file\'s name. Please enter complete path of a file containing file\'s name:\n"),)
             except ET.ParseError:
                 with open(self.filesList[0]) as xmlFile:
                     xml_content = xmlFile.read()
                 xml_content = re.sub(
                     r'(=)([0-9]+)(>)', r'\1"\2"\3', xml_content)
                 xml_content = re.sub(r'&', r'&amp;', xml_content)
                 with open('{}.xml'.format(task_name), 'w') as new_xmlFile:
                     new_xmlFile.write(xml_content)
                 tree = ET.parse('{}.xml'.format(task_name))
                 remove('{}.xml'.format(task_name))
                 break
             else:
                 break
         root = tree.getroot()
         for child in root:
             if child.tag not in self._xmlPart:
                 self._xmlPart[child.tag] = child.text
             else:
                 pass
             for subChild in child:
                 if subChild.text is not None:
                     self._xmlPart[child.tag] += subChild.text
                 else:
                     pass
                 if child.tag == 'TEXT' and subChild.tag == 'F':
                     self._xmlPart[child.tag] += subChild.tail
         text = self._xmlPart.get('TEXT')
         _title = [value for key, value in self._xmlPart.items()
                   if key in type_of_titles]
         doc_title = ''
         if _title:
             doc_title += _title[0]
             doc_title = re.sub(r'\n\n', '\n', doc_title)
             doc_title = re.sub(r'([^.])\n', r'\1 ', doc_title)
             doc_title = re.sub(r'@', r'', doc_title)
             doc_title = re.sub(r' +----+ +.*', r'', doc_title)
             doc_title = re.sub(r'[``|"|:]', '', doc_title)
             doc_title = re.sub(r' +', ' ', doc_title)
             doc_title = doc_title.strip()
     else:
         if len(self.filesList) == 1:
             while True:
                 try:
                     files = [file for file in listdir(self.filesList[0]) if isfile(
                         join(self.filesList[0], file))]
                 except NotADirectoryError:
                     self.filesList = (
                         input("You should enter a folder directory:\n"),)
                 else:
                     break
             text, doc_title = '', ''
             for file in files:
                 try:
                     tree = ET.parse(join(self.filesList[0], file))
                 except ET.ParseError:
                     with open(join(self.filesList[0], file)) as xmlFile:
                         xml_content = xmlFile.read()
                     xml_content = re.sub(
                         r'(=)([0-9]+)(>)', r'\1"\2"\3', xml_content)
                     xml_content = re.sub(r'&', r'&amp;', xml_content)
                     with open('{}.xml'.format(task_name), 'w') as new_xmlFile:
                         new_xmlFile.write(xml_content)
                     tree = ET.parse('{}.xml'.format(task_name))
                     remove('{}.xml'.format(task_name))
                 root = tree.getroot()
                 xml_part = dict()
                 self.find_tags_recursively(root, xml_part)
                 temp_text = xml_part.get('TEXT')
                 _title = [value for key, value in xml_part.items()
                           if key in type_of_titles]
                 title_text = ''
                 if _title:
                     title_text += _title[0].lstrip()
                     title_text = re.sub(r'([^.])\n', r'\1 ', title_text)
                     title_text = re.sub('FT.*[0-9]+ +/ +', '', title_text)
                     title_text = re.sub(r'--+', r'', title_text)
                     title_text = re.sub(r' +- *', r' ', title_text)
                     title_text = re.sub(r'(``|\'\'|")', r'', title_text)
                     # For removing ' from beginning of a quotation
                     title_text = re.sub(
                         r'(\s)\'(\w+)', r'\1\2', title_text)
                     title_text = re.sub(
                          r' *\(.*\)', r'', title_text, flags=re.DOTALL)
                     title_text = re.sub(r'\n', r' ', title_text)
                     title_text = title_text.strip()
                     if file.startswith('LA'):
                         title_text = get_true_case(title_text)
                     doc_title += '{}\n'.format(title_text)
                 if temp_text:
                     # For removing complete tables from text
                     temp_text = re.sub(
                         r'( +-{4,}\n)(.+\n)*( +-{4,}\n*)', r'', temp_text)
                     temp_text = re.sub(r'([^.])\n', r'\1 ', temp_text)
                     temp_text = temp_text.strip()
                     text += '{}\n'.format(temp_text)
             doc_title = re.sub(r'\n\n', r'\n', doc_title)
             doc_title = re.sub(r'\n ', r'\n', doc_title)
             hyphen_words = re.findall(
                 r'\s*(?:{})(?:\.|,|!|\?|/|\'|\s+)'.format('|'.join(not_hyphenated)), doc_title, re.I)
             if hyphen_words:
                 hyphen_dict = dict()
                 for word in hyphen_words:
                     hyphen_dict.update({word: word.replace('-', ' ')})
                 hyphen_dict = dict((re.escape(k), v)
                                    for k, v in hyphen_dict.items())
                 pattern = re.compile("|".join(hyphen_dict.keys()))
                 doc_title = pattern.sub(
                     lambda m: hyphen_dict[re.escape(m.group(0))], doc_title)
             doc_title = doc_title.replace('POP/ROCK', 'POP ROCK')
             doc_title = doc_title.replace('/LOCAL', 'LOCAL')
     text = text.strip()
     text = re.sub(r'\.( \"[A|a]nd)', r'\1', text)
     text = re.sub(r'(\.\.\.|\. \. \.)', r'', text)
     text = re.sub(r'--', r'', text)
     text = re.sub(r'([A-Z.][A-Z.])( +\n+ +)', r'\1 .\2', text)
     text = re.sub(r'(Inc\.)( +\n\n+ +)', r'\1 .\2', text)
     text = re.sub(r'\n\n', '\n', text)
     text = re.sub(r'.(\" [a-zA-Z0-9]* said.)', r',\1', text)
     text = re.sub(r'(``|\'\'|")', r'', text)
     # for removing ' from beginning of a quotation
     text = re.sub(r'(\s)\'(\w+)', r'\1\2', text)
     text = re.sub(r' \. \. \.', r'', text)
     text = re.sub(r'(.)\s+(\n)', r'\1\2', text)
     text = re.sub(r'([^.])\n', r'\1 ', text)
     text = re.sub(r'^.*\[Text\] ', r'', text)
     text = re.sub(r'\[.*\] ', r'', text)
     text = re.sub(r'(\n) (\S)', r'\1\2', text)
     text = re.sub(r'(AG|GM|VW|Volkswagen|and|he|his|Essex)/'
                   '(GM|Opel|General|or|she|her|London)', r'\1 \2', text)
     return text, doc_title, task_name
Example #23
def generateReply(body, db_message):
    if not current_user.is_authenticated:
        return jsonify({"status": "Page Blocked", "authenticated": False})
    # return jsonify({
    #  "chatbot_response" : {
    #      "body": "Response",
    #      "timestamp": 10,
    #      "order": 2
    #  },
    #  "grammar_correction" : {
    #         "body": "Response",
    #         "timestamp": 10,
    #         "order": 2
    #     }
    # })

    user = current_user
    message = addPunctuation(
        spell_checker.correct_sentence(truecase.get_true_case(
            body)))  # fix capitalization, spelling, and punctuation
    chatbot_body = ''
    print('Before Prediction')
    print('NUMBER OF THREADS: ', threading.active_count())
    if ('bye' in message.lower()):
        chatbot_body = 'See you later!'
    else:
        chatbot_body = chatbot.predictResponse(context=message)
    print('Done predicting')
    chatbot_body = truecase.get_true_case(chatbot_body)
    grammar_correction_response = grammar_checker.check_grammar(
        input_sentence=message)
    stripped_message = stripChars(message)
    stripped_correction = stripChars(grammar_correction_response)
    grammar_body = ''
    if stripped_message == stripped_correction or len(message.split(' ')) <= 2:
        user.userData[0].correctSentences += 1
        db_message.correct = 1
    else:
        formatted_grammar_response = truecase.get_true_case(
            grammar_correction_response)
        grammar_body = 'Did you mean: ' + formatted_grammar_response
        db_message.correct = 0
    order = 2
    chatbot_response = Message(body=chatbot_body, author=user, order=order)

    db.session.add(chatbot_response)
    if grammar_body != '':
        grammar_correction = Message(body=grammar_body,
                                     author=user,
                                     order=order)
        db.session.add(grammar_correction)
    db.session.commit()
    return jsonify({
        "chatbot_response": {
            "body": chatbot_response.body,
            "timestamp": chatbot_response.timestamp,
            "order": chatbot_response.order,
        },
        "grammar_correction": {
            "body": grammar_body,
            "timestamp":
            0 if grammar_body == '' else grammar_correction.timestamp,
            "order": 0 if grammar_body == '' else grammar_correction.order,
        }
    })
Example #24
 def _truecase_hook(raw):
     return truecase_.get_true_case(raw, out_of_vocabulary_token_option=oov)
Example #25
    temp = np.load('mapping/layer-8/europarl-v7.%s-%s.%s.GBDD' %
                   (src, tgt, args.layer),
                   allow_pickle=True)
    bias = torch.tensor(temp, dtype=torch.float).to(device)

    data = pd.read_csv(os.path.join('WMT17', 'testset', path), sep='\t')
    references = data['reference'].tolist()
    translations = data['translation'].tolist()
    source = data['source'].tolist()
    human_score = data['HUMAN_score'].tolist()
    sentBLEU = data['sentBLEU'].tolist()

    with MosesDetokenizer(src) as detokenize:
        source = [detokenize(s.split(' ')) for s in source]
    with MosesDetokenizer(tgt) as detokenize:
        references = [detokenize(s.split(' ')) for s in references]
        translations = [detokenize(s.split(' ')) for s in translations]

    translations = [truecase.get_true_case(s) for s in translations]

    xmoverscores = scorer.compute_xmoverscore(args.alignment, projection, bias, source, translations, ngram=args.ngram, \
                                              layer=args.layer, dropout_rate=args.dropout_rate, bs=args.batch_size)

    lm_scores = scorer.compute_perplexity(translations, bs=1)

    scores = metric_combination(xmoverscores, lm_scores, [1, 0.1])

    print('\r\nlp:{} xmovescore:{} xmoverscore+lm:{}'.format(
        lp, pearson(human_score, xmoverscores), pearson(human_score, scores)))
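In isolation, the detokenize-then-truecase step looks roughly like this (assuming the same context-manager style MosesDetokenizer used above, e.g. from the mosestokenizer package, plus the truecase package):

import truecase
from mosestokenizer import MosesDetokenizer

with MosesDetokenizer("en") as detokenize:
    sentence = detokenize("this is a tokenized sentence .".split(' '))
print(truecase.get_true_case(sentence))  # e.g. "This is a tokenized sentence."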
Example #26
    def geoparsing(self,
                   text,
                   case_correct=False,
                   limit=5,
                   gazetteer_cg=False):
        """
        Method that performs the geoparsing of text,

        NOTE: use the geoparsing without the correct case and withour
        the gazetteer will give you poor results.

        Params:
        ----------
        text : String
            - Text that to performs the geoparsing.
        case_correct: Bool
            - If the text is with correct case.
        limit: Int
            - Maximum limit of returned addresses.
        gazetteer_cg: Bool
            - If you want to use the gazetteer with locations in
            the state of Paraíba.

        Return:
        ----------
        result : List
            - List of addresses.
        """
        if gazetteer_cg:
            result = self.filter_address_text(text.lower())
            if result:
                return result
            else:
                raise Exception("Text geoparsing could not be performed")
        else:
            if case_correct:
                doc = self.nlp(text)
                ents_loc = list(
                    filter(
                        lambda entity: entity.label_ == "LOC" or entity.label_
                        == "GPE", doc.ents))
                address_found = self.concantenate_address(ents_loc)
                result = self.check_address(address_found, limit)
                if result[0]:
                    return result[1]
                else:
                    raise Exception("Text geoparsing could not be performed")
            else:
                text = truecase.get_true_case(text)

                text_en = self.translator.translate(text, dest="en")
                text_en = text_en.text
                text_true_case = truecase.get_true_case(text_en)

                text_pt = self.translator.translate(text_true_case,
                                                    src="en",
                                                    dest="pt")
                text = text_pt.text

                doc = self.nlp(text)
                return self.geoparsing(text, case_correct=True)
Example #27
def tc(string):
    # Prepend a dummy token so the first real word is not treated as
    # sentence-initial by the truecaser, then strip the "A " prefix again.
    string = truecase.get_true_case('A ' + string)
    return string[2:]
Example #28
def tc(string):
    return truecase.get_true_case('A ' + string)[2:]
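A hedged illustration of the effect of the 'A ' prefix (output depends on the truecase model): without it the truecaser title-cases the first word, with it a mid-sentence fragment keeps its lowercase start:

import truecase

print(truecase.get_true_case("the weather in new york"))  # e.g. "The weather in New York"
print(tc("the weather in new york"))                      # e.g. "the weather in New York"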
Example #29
def preprocess_text(inputText):
    clean_text = re.sub(' +', ' ', inputText)        # collapse repeated spaces
    clean_text = contractions.fix(clean_text)        # expand contractions
    clean_text = truecase.get_true_case(clean_text)  # restore casing
    return clean_text
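A quick hedged usage example (assumes the contractions and truecase packages; the exact expansions and casing depend on those libraries):

print(preprocess_text("i  think  it's  friday"))
# e.g. "I think it is Friday"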
Example #30
import truecase

# truecase every query in the input file and write the results to a new file
with open("allqueries.txt", "r") as fi, open("allQueries.txt", "w") as fo:
    for q in fi:
        fo.write(truecase.get_true_case(q))
        fo.write("\n")