def _id_field_iter(text):
    current_id = None
    for match in re.finditer(r'^(\w+)\s+=\s+\{\s*(.*?)\s*\}$', text,
                             re.MULTILINE | re.DOTALL):
        key, value_bytes = match.groups()
        # &#X only happens once, but it's wrong and ftfy doesn't treat it the
        # same way as &#x
        value_bytes = value_bytes.replace('&#X', '&#x')
        # same for ,, -- only happens once
        value_bytes = value_bytes.replace(',,', ',')
        # reverse line feed? no thanks
        value_bytes = value_bytes.replace('\x8d', '')
        # I don't even know...
        value_bytes = value_bytes.replace('&#scaron;', 'š')
        # you'd think that ;; sequences would make sense -- a non-final author
        # with a first name ending in an html entity, but this actually fixes
        # more things than it breaks (need to address the few cases where it
        # does break)
        value_bytes = value_bytes.replace(';;', ';')
        # UTF-8 is a better first guess, but will break on some of the input
        try:
            value_unicode = value_bytes.decode('UTF-8')
        except UnicodeDecodeError:
            value_unicode = value_bytes.decode('ISO-8859-2')
        value = fix_text(value_unicode, fix_entities=True, normalization='NFKC')
        # ftfy docs says it will repeat if needed, but it doesn't?
        value = fix_text(value, fix_entities=True, normalization='NFKC')
        if key == 'id':
            current_id = value
        yield current_id, (key, value)
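# Minimal usage sketch (not from the original project); the sample record and
# field names below are made up to show the "key = {value}" layout the regex
# above expects, with each record introduced by an "id" field.
_SAMPLE = (
    "id = {rec0001}\n"
    "author = {Nov&#xe1;k, Jan}\n"
    "title = {An example title}\n"
)

# In the Python 2 context of the function above the input is a byte string:
# for record_id, (key, value) in _id_field_iter(_SAMPLE):
#     print record_id, key, value   # e.g. ('rec0001', 'author', u'Novák, Jan')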
def test_real_text():
    """
    Test with text actually found in the wild (mostly on Twitter).

    I collected test cases by listening to the Twitter streaming API for
    a million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8.

    TEST_CASES contains the most interesting examples of these, often with
    some trickiness of how to decode them into the actually intended text.

    For some reason, sampling Twitter gives no examples of text being
    accidentally decoded as Windows-1250, even though it's one of the more
    common encodings and this mojibake has been spotted in the wild. It may be
    that Windows-1250 is used in places that culturally don't use Twitter much
    (Central and Eastern Europe), and therefore nobody designs a Twitter app or
    bot to use Windows-1250. I've collected a couple of examples of
    Windows-1250 mojibake from elsewhere.
    """
    for orig, target in TEST_CASES:
        # make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # make sure we can decode as intended even with an extra layer of badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
def test_real_tweets():
    """
    Test with text actually found on Twitter.

    I collected these test cases by listening to the Twitter streaming API for
    a million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8. The tweets that appear in TEST_CASES are the
    most interesting examples of these, with some trickiness of how to decode
    them into the actually intended text.
    """
    for orig, target in TEST_CASES:
        # make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # make sure we can decode as intended even with an extra layer of badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
def _process_tag(story):
    tags = story.find("Editor_Tags")
    return {
        "date": datetime.strptime(story.get("Date"), "%d %b %Y"),
        "story_id": int(story.get("StoryId")),
        "headline": fix_text(unicode(story.find("Headline").text)),
        "text": fix_text(unicode(story.find("Story_text").text)),
        "source": story.find("Source").text,
        "tags": {int(e.attrib.get("OrgID")): e.attrib.get("Relevance")
                 for e in tags} if tags is not None else [],
    }
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    eq_(fix_text(example), '&\n<html>\n&amp;')
    eq_(fix_text_segment(example), '&amp;\n<html>\n&amp;')

    eq_(fix_text(example, fix_entities=True), '&\n<html>\n&')
    eq_(fix_text_segment(example, fix_entities=True), '&\n<html>\n&')

    eq_(fix_text(example, fix_entities=False), '&amp;\n<html>\n&amp;')
    eq_(fix_text_segment(example, fix_entities=False), '&amp;\n<html>\n&amp;')

    eq_(fix_text_segment('&lt;&gt;', fix_entities=False), '&lt;&gt;')
    eq_(fix_text_segment('&lt;&gt;', fix_entities=True), '<>')
    eq_(fix_text_segment('&lt;&gt;'), '<>')
def check_ftfy(self, text, encoding_only=True):
    """
    Given a single text input, check whether `ftfy.fix_text_encoding`
    would change it. If so, display the change.
    """
    self.count += 1
    text = unescape_html(text)
    if not possible_encoding(text, 'ascii'):
        if encoding_only:
            fixed = fix_encoding(text)
        else:
            fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
        if text != fixed:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=text, fixed=fixed
            ))
            self.num_fixed += 1
        elif 'â€' in text or '\x80' in text:
            print('\nNot fixed:\t{text!r}'.format(text=text))

    # Print status updates once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
    if self.count % 10000 == 0:
        print('\n%d/%d fixed' % (self.num_fixed, self.count))
def __init__(self, url):
    """ Init obj """
    super(WatchHerePage, self).__init__(url)
    if not self.is_watch_here_page():
        raise PageTypeError('Not a Watch Here page')
    self.desc = ftfy.fix_text(self.entry.p.text)
    self.watch_here_link = get_abs_ct_url(self.entry.a['href'])
def standardize_as_list(text, token_filter=None):
    """
    Get a list of tokens or stems that appear in the text.

    `token_filter` is an optional function to apply to the list of tokens,
    performing language-specific lemmatization and stopword removal. In
    practice, the only such filter is for English.

    >>> standardize_as_list('the dog', token_filter=english_filter)
    ['dog']
    >>> standardize_as_list('big dogs', token_filter=english_filter)
    ['big', 'dog']
    >>> standardize_as_list('big dogs')
    ['big', 'dogs']
    >>> standardize_as_list('to go', token_filter=english_filter)
    ['go']
    >>> standardize_as_list('the', token_filter=english_filter)
    ['the']
    >>> standardize_as_list('to', token_filter=english_filter)
    ['to']
    """
    text = fix_text(text)
    tokens = [token for token in simple_tokenize(text)]
    if token_filter is not None:
        tokens = token_filter(tokens)
    return tokens
def tokenizer(text, tokenizer_fn, to_lower=False):
    text = ftfy.fix_text(text)
    if to_lower:
        text = text.lower()
    try:
        seq = Sequence(text.strip())
    except ValueError:
        return
    tokens = tokenizer_fn.transform(seq)
    new_tokens = []
    for token in tokens:
        if token.strip() == "":
            continue
        elif PUNCTSYM.search(token):
            token = "$"
        elif LIKENUM.search(token):
            token = "0"
        elif LIKEUNIT.search(token):
            token = LIKEUNIT.sub(r"0 \1", token)
        elif token == "can't":
            token = "can not"
        elif CONTRACTION1.search(token):
            token = CONTRACTION1.sub(r"\1 '\2", token)
        elif CONTRACTION2.search(token):
            token = CONTRACTION2.sub(r"\1 n't", token)
        new_tokens.append(token)
    if new_tokens:
        return " ".join(new_tokens).strip()
    return
def read_clusters(clusters_loc):
    clusters = {}
    if ftfy is None:
        user_warning(Warnings.W004)
    with clusters_loc.open() as f:
        for line in tqdm(f):
            try:
                cluster, word, freq = line.split()
                if ftfy is not None:
                    word = ftfy.fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = "0"
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters
def get_article_sentences(article):
    subs = {'Sen.': 'Senator', 'Lt. Gov.': 'Lieutenant Governor',
            'Rep.': 'Representative', 'Reps.': 'Representatives,',
            'Gov.': 'Governor'}
    if 'body' in article:
        text = fix_text(article['body']).replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
        for a in subs:
            text = text.replace(a, subs[a])
        sentences = sentence_splitter.tokenize(text)
        return sentences
    elif 'text' in article:
        text = fix_text(article['text']).replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
        for a in subs:
            text = text.replace(a, subs[a])
        sentences = sentence_splitter.tokenize(text)
        return sentences
    return []
def buildHTML(room, date, timezone):
    datestring = str(date.month) + "/" + str(date.day) + "/" + str(date.year)
    messages = msgByDate(room, date, timezone)
    roomtitle = room.title
    if len(messages) == 0:
        print "no messages to send, exiting"
        sys.exit(0)
    else:
        for message in reversed(messages):
            if u'text' in message.keys():
                message['localmsgtime'] = shiftToLocal(message[u'created'], timezone)
                message['timestamp'] = timeFixUp(message['localmsgtime'].hour) + ":" + timeFixUp(message['localmsgtime'].minute) + ":" + timeFixUp(message['localmsgtime'].second)
                message['displayname'] = getDisplayName(message['personId'], room.users)
                ## Need to work on this more. Need to account for Ellips before we ignore it. Need to figure out bullets
                ## For now this will just ignore things that don't directly convert to ascii
                message[u'text'] = message[u'text'].encode("ascii", "ignore")
                message[u'text'] = message[u'text'].decode("utf-8")
                message[u'text'] = fix_text(message[u'text'], normalization="NFKC")
    env = Environment(loader=PackageLoader('sparkdaily', 'templates'))
    template = env.get_template('newsletter.html')
    html = template.render(roomtitle=roomtitle, messages=reversed(messages), datestring=datestring)
    return html
def transcribeStory(story: list):
    """
    Takes a list of dicts that contain the story text and metadata
    and transcribes it into the formatted book.
    """
    flushBook()
    # renderer = mistune.Renderer(escape=True, hard_wrap=True)
    # markdown = mistune.Markdown(renderer = renderer)
    for t in story:
        text_translate_table[(t['type'])](t)
    for c in story:
        try:
            addCitation([c['cite']])
        except KeyError as err:
            continue
    result = readBook()
    with open("output.markdown", mode='wt', encoding="utf-8") as file:
        file.write(result)
    renderer = mistune.Renderer(escape=True, hard_wrap=False)
    markdown = mistune.Markdown(renderer=renderer)
    htmltext = ftfy.fix_text(markdown(result))
    with open("output.html", mode='wt', encoding="utf-8") as file:
        file.write(htmltext)
    return result
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.
    If `cutoff` is greater than 0, the csv file must be sorted by value in
    descending order.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
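# A self-contained sketch (not part of the original module) of the CSV layout
# read_values() expects: one "word,value" row per line, in descending order of
# value whenever a nonzero cutoff is used. The file and cutoff are made up.
import csv
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False,
                                 encoding='utf-8', newline='') as demo:
    csv.writer(demo).writerows([('the', '1000000'), ('cat', '5000'), ('café', '120')])

# values, total = read_values(demo.name, cutoff=1000, lang='en')
# 'the' and 'cat' would be kept; reading stops at 'café', whose value falls
# below the cutoff.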
def tidy(self):
    tidied = []
    for obj in Lexicon.objects.all():
        new = fix_text(
            obj.entry.strip(),
            fix_entities=True,
            normalization="NFKC",
            uncurl_quotes=True
        )
        if obj.entry != new:
            tidied.append((obj, new))
    return tidied
def webfix_unicode(possible_string):
    """
    This is ugly but it will create Times-approved HTML
    out of terrible cut-and-paste from decision text.
    """
    CHAR_MAP = [
        (u'\xa7', u'§'),
        (u'\u2014', u'—'),
        (u'\u2013', u'–'),
        (u'\x97', u'—'),
        (u'\xa4', u'€'),
        (u'\u201c', u'"'),
        (u'\u201d', u'"'),
        (u'\x96', u'–'),
    ]
    if isinstance(possible_string, basestring):
        string = possible_string
        string = string.strip()
        for char, replace_char in CHAR_MAP:
            string = string.replace(char, replace_char)
        string = string.decode('utf-8')
        string = unicode(string)
        string = ftfy.fix_text(string)
        string = smartypants.smartypants(string)
        return string
    return possible_string
def preprocess_text(text):
    """
    Given any basestring as input, make its representation consistent:

    - Ensure that it is a Unicode string, converting from UTF-8 if necessary.
    - Detect whether the text was incorrectly encoded into UTF-8 and fix it,
      as defined in `fix_bad_unicode`.
    - Replace HTML entities with their equivalent characters.
    - Replace newlines and tabs with spaces.
    - Remove all other control characters.
    - Normalize it with Unicode normalization form KC, which applies the
      following relevant transformations:
      - Combine characters and diacritics that are written using separate
        code points, such as converting "e" plus an acute accent modifier
        into "é", or converting "ka" (か) plus a dakuten into the single
        character "ga" (が).
      - Replace characters that are functionally equivalent with the most
        common form: for example, half-width katakana will be replaced with
        full-width, and full-width Roman characters will be replaced with
        ASCII characters.
    """
    if isinstance(text, str):
        text = text.decode('utf-8')
    return fix_text(text)
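# A small illustration (not from the original module) of the NFKC behaviour the
# docstring above describes, using the standard library directly: half-width
# katakana plus a dakuten are composed, and full-width Roman letters fold to ASCII.
from unicodedata import normalize

assert normalize('NFKC', 'ｶﾞ') == 'ガ'
assert normalize('NFKC', 'ｔｅｓｔ') == 'test'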
def test_json_example(test_case):
    # Run one example from the data file
    orig = test_case['original']
    fixed = test_case['fixed']

    # Make sure that the fix_encoding step outputs a plan that we can
    # successfully run to reproduce its result
    encoding_fix, plan = fix_encoding_and_explain(orig)
    assert apply_plan(orig, plan) == encoding_fix

    # Make sure we can decode the text as intended
    assert fix_text(orig) == fixed
    assert encoding_fix == test_case.get('fixed-encoding', fixed)

    # Make sure we can decode as intended even with an extra layer of badness
    extra_bad = orig.encode('utf-8').decode('latin-1')
    assert fix_text(extra_bad) == fixed
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
def smart_unicode(text):
    if text:
        if not isinstance(text, unicode):
            text = unicode(text, "utf8")
        text = ftfy.fix_text(text)
    return text
def __call__(self, si, context):
    if not si.body or not si.body.raw:
        return si
    read_from = getattr(si.body, self.config['read_from'])
    setattr(si.body, self.config['write_to'],
            ftfy.fix_text(read_from.decode('utf-8')).encode('utf-8'))
    return si
def standardize(text, lang='en', remove_accents=True):
    text = fix_text(text)
    if remove_accents and (lang == 'es' or text.startswith('/c/es/')):
        text = normalize('NFD', text).encode('ascii', errors='ignore').decode()
    if text.startswith('/c/'):
        return replace_numbers(text)
    else:
        return standardized_concept_uri(text, lang)
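# Minimal illustration (not from the original module) of the accent-stripping
# idiom above: NFD splits "é" into "e" plus a combining accent, and encoding to
# ASCII with errors='ignore' then drops the combining mark.
from unicodedata import normalize

assert normalize('NFD', 'café').encode('ascii', errors='ignore').decode() == 'cafe'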
def parse(self):
    """
    Removes all unicode characters, nonprintable characters, and unneeded
    special characters. This formats the text for audio reading.
    """
    try:
        # Attempt to scrub the unicode with a library
        text = ftfy.fix_text(self.text)
        self.text = unidecode.unidecode(text).replace('[?]', '')
    except Exception:
        # If that fails, kill it with fire.
        print("Nuking the text.")
        text = bytes(self.text, 'utf-8')
        text = text.decode('unicode_escape')
        text = text.encode('ascii', 'ignore')
        text = text.decode('utf-8')
        self.text = str(text)

    try:
        # Try to translate the story into the reader's language
        if self.language != language:
            self.translate(language)
    except:
        pass

    # Formats text to remove odd artifacts from the conversion
    self.changes.update({
        '\n': ' ', '\r': ' ',
        '"': "'", '.': '. ',
        '. . . ': '', '. . .': '...',
        "\'": "'", '\"': '',
        ':': ': ', ': ': ': ',
        '!': '! ', '! ': '! ',
        '?': '? ', '? ': '? ',
        ';': '; ', '; ': '; ',
        '0': '0 ', '1': '1 ',
        '2': '2 ', '3': '3 ',
        '4': '4 ', '5': '5 ',
        '6': '6 ', '7': '7 ',
        '8': '8 ', '9': '9 '
    })

    if self.speech == 'local':
        # The Microsoft SAPI pronunciation is a bit off
        self.changes.update({
            'Tali': 'Tahlie', 'tali': 'tahlie',
            'Yalo': ' Yah-lo ', 'caf ': 'cafe ',
            'Garrus': 'Gae-rrus', 'Klenon': 'Klenn une',
            'Binary': 'Bi-nary', 'Noveria': ' No-veir-eaah ',
            'Vakarian': 'Vah-kare-eean'
        })
    else:
        # Google's TTS is better at its job :)
        self.changes.update({
            'Tali': 'Tahhlee', 'tali': 'Tahhlee',
            'caf ': 'cafe '
        })

    # Apply the changes to the text.
    for original_word, changed_word in self.changes.items():
        self.text = self.text.replace(original_word, changed_word)
def putline(self, line):
    line = line + '\r\n'
    if self.debugging > 1:
        print('*put*', self.sanitize(line))
    # FORCE the line to ALWAYS be utf-8.
    line = ftfy.fix_text(line)
    line = line.encode("UTF-8")
    self.sock.sendall(line)
def __init__(self, text, frenchspacing=False, normalise=False):
    self.data = text
    if normalise:
        self.data = ftfy.fix_text(self.data)
    if not frenchspacing:
        self._sentence_to_interstitial_spacing()
        self._interstitial_to_sentence_spacing()
    self._latex_symbols()
    self._hyphens_to_dashes()
def clean_unicode(possible_string):
    if isinstance(possible_string, basestring):
        string = possible_string
        string = string.strip()
        string = string.decode('utf-8')
        string = unicode(string)
        string = ftfy.fix_text(string)
        return string
    return possible_string
def standardized_concept_uri(text, lang='en'):
    text = fix_text(text)
    tokens = simple_tokenize(text)
    if lang == 'en':
        tokens = english_filter(tokens)
    tokens = [replace_numbers(token) for token in tokens]
    slug = replace_numbers('_'.join(tokens))
    return '/'.join(['/c', LCODE_ALIASES.get(lang, lang), slug])
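# Rough usage sketch, inferred from the doctests of standardize_as_list above
# rather than from this module's own tests: english_filter drops stopwords and
# stems, and the remaining tokens are joined with underscores under /c/<lang>/.
#
#   standardized_concept_uri('big dogs', 'en')   # expected to be '/c/en/big_dog'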
def txt2words(self, txt, remove_stopwords=True):
    txt = BeautifulSoup(txt).get_text()
    txt = ftfy.fix_text(txt)
    txt = txt.replace("\\n", '')
    txt = re.sub("[^0-9a-zA-Z]", " ", txt)
    if remove_stopwords:
        words = [self.save_stem(w) for w in txt.lower().split()
                 if (w not in self.stopwords) & (len(w) > 2) & (not w.isdigit())]
    else:
        words = [self.save_stem(w) for w in txt.lower().split()
                 if (len(w) > 2) & (not w.isdigit())]
    return words
def make_ascii_friendly(text):
    return ftfy.fix_text(text)


LISTING = '/Users/Kristen/PycharmProjects/proj/Other/data/current_dists.txt'
def _deserialize(self, value, attr, data):
    """Deserialize sanitized string value."""
    value = super(SanitizedUnicode, self)._deserialize(value, attr, data)
    value = fix_text(value)

    # NOTE: This `join` might be inefficient... There's a solution with a
    # large compiled regex lying around, but needs a lot of tweaking.
    value = ''.join(filter(self.is_valid_xml_char, value))
    for char in self.UNWANTED_CHARACTERS:
        value = value.replace(char, '')
    return value
def generate_corpus_for_quality_evaluation(k, pz_d, tweets, topic_words_distribution):
    all_tweets = []
    logger.info(k)
    df = pd.read_csv(tweets, encoding='utf-8')
    for index, row in df.iterrows():
        all_tweets.append(row['tweets'])

    tweets_pz_d = []
    with open(pz_d) as f:
        for l in f:
            line = l.strip().split(' ')
            tweets_pz_d.append([float(p) for p in line])

    results = {}
    for j in range(len(tweets_pz_d)):
        if 'nan' not in tweets_pz_d[j] and '-nan' not in tweets_pz_d[j]:
            sorted_pz_ds = list(tweets_pz_d[j])
            sorted_pz_ds.sort(reverse=True)
            topic_id = tweets_pz_d[j].index(sorted_pz_ds[0])
            if topic_id not in results:
                results[topic_id] = [all_tweets[j]]
            else:
                results[topic_id].append(all_tweets[j])

    final_result = []
    for tp in results:
        for keyword in topic_words_distribution[tp][1]:
            temp = []
            dedup = set()
            for tweet in results[tp]:
                if '%s' % keyword[0] in tweet.lower():
                    clean_text_list = (common.cleanhtml(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(
                                tweet.lower()))))).strip(' ').replace(
                                    '\n', ' ').split(' ')[:-1]
                    clean_text = ",".join(str(x) for x in clean_text_list)
                    if clean_text not in dedup:
                        temp.append(tweet)
                        dedup.add(clean_text)
            # samples_number = random.sample(range(1, len(temp)), 1)
            # if (tp == 6) and (keyword[0] == 'u.s.'):
            #     logger.info(temp)
            #     quit()
            samples_number = []
            if len(temp) <= 2:
                samples_number = range(len(temp))
            else:
                samples_number = random.sample(range(1, len(temp)), 2)
            for i in samples_number:
                result = {}
                result['topic_id'] = tp
                result['keyword'] = keyword[0]
                result['propability'] = keyword[1]
                result['tweet'] = temp[i]
                final_result.append(result)

    to_csv(
        final_result,
        '../../papers/2017_BMC_HPV/analysis/BTM/quality_evaluation/' + str(k) + 'tp.csv')
def _text_from_page(self, page: fitz.Page) -> str:
    bloks = page.get_text_blocks()
    text = [blk[4].replace("\n", " ") for blk in bloks if blk[-1] == 0]
    text = "\r\n".join(text)
    return ftfy.fix_text(text, normalization="NFKC")
def remove_specific_stop(words):
    punct = ['%', ',', '/', '(', ')', '.']  # frequent punctuation terms inside strings or digits
    for p in punct:
        words = words.replace(p, ' ')
    words = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", words)
    return words


texts, article = [], []
texts_txt = ''
for year in years_available:
    file_list = glob.glob("sources_data/{}/*.txt".format(year))
    for f in file_list:
        words = open(f).read()
        words = fix_text(words)  # Fix any unicode problem
        words = words.replace('\n', ' ').replace('\r', '')  # remove line breaks
        words = remove_specific_stop(words)
        words = gb_to_us(words)
        if len(words.split()) >= 30:  # Only abstracts with at least 30 words
            nlp_words = nlp(words)
            for word in nlp_words:
                if not is_noise(word):
                    article.append(word.lemma_)
            texts.append(article)
            texts_txt = texts_txt + ' '.join(article) + '\n'
            article = []
    with open("{}{}.pickle".format(DATA_CLEAN, year), "wb") as fp:
        pickle.dump(texts, fp)
except Exception as e:
    syslog.syslog(str(e))
    stdin_used = True

#except Exception as e:
#    if debug:
#        syslog.syslog("FATAL ERROR: Not all required input received")
#    print(str(e))
#    syslog.syslog(str(e))
#    sys.exit(1)

#if debug:
#    syslog.syslog("Encoding of subject: {0}".format(ftfy.guess_bytes(email_subject)[1]))
#    syslog.syslog("Encoding of body: {0}".format(ftfy.guess_bytes(email_data)[1]))

try:
    email_data = ftfy.fix_text(email_data.decode("utf-8", "ignore"))
except:
    email_data = ftfy.fix_text(email_data)

try:
    email_subject = ftfy.fix_text(email_subject.decode("utf-8", "ignore"))
except:
    email_subject = ftfy.fix_text(email_subject)

if debug:
    syslog.syslog(email_subject)
    syslog.syslog(email_data)

misp_url = config.misp_url
misp_key = config.misp_key
misp_verifycert = config.misp_verifycert
import pandas as pd
import numpy as np

data_path = '/Users/fredde/Database/'
df_load = pd.read_hdf(data_path + 'all_data_1year_comp.h5', 'table')
df_load.head()

import ftfy

a = list(df_load)
for i in range(len(a)):
    a[i] = ftfy.fix_text(a[i])
print(a)

df_load[list(df_load)[1]].plot()

headers = open(data_path + 'headers.csv', 'w')
for item in a:
    headers.write('\n' + str(item.encode('utf-8')))
def fix_text_encoding(sentence: str):
    return fix_text(sentence)
for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url
    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                          page_structure=False, safe_attrs_only=False)

        tree = ""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
            tree = etree.tostring(document)
        except:
            continue

        tree = etree.tostring(document)
        cleantree = tree.decode("utf8").replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        lang = guess_lang_from_data2(cleantree)
        if len(languages) > 0 and lang not in languages:
            logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.")
        else:
            # If enabled, remove boilerplate HTML
            if options.boilerpipe:
def clean_wp_token(self, text):
    text = text.replace("\u0120", "", 1)
    text = text.replace("\u010a", "", 1)
    text = ftfy.fix_text(text)
    text = clean_extended_unicode(text)
    return text.strip()
def clean_wp_token(self, text):
    text = ftfy.fix_text(text)
    text = clean_extended_unicode(text)
    text = self._replace_re.sub("", text)
    return text.replace("</w>", "").strip()
def _standardize(text):
    return text_standardize(ftfy.fix_text(text))
def pre_processing_dataset(self):
    self.tweets = pd.read_csv("mydatasetnew.csv", usecols=['author', 'sentence', 'type'])
    df = pd.read_csv("contractions.csv", usecols=['col1', 'col2'])
    contractions_dict = dict(zip(list(df.col1), list(df.col2)))
    self.sentence_list, self.type_list, self.author_list = [], [], []
    c_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

    def expand_contractions(text, c_re=c_re):
        def replace(match):
            return contractions_dict[match.group(0)]
        return c_re.sub(replace, text)

    self.word2vec = KeyedVectors.load_word2vec_format(
        "word2vec_twitter_tokens.bin", unicode_errors='ignore', binary=True)
    count = Counter()
    for author, sentence, type in zip(self.tweets['author'],
                                      self.tweets['sentence'],
                                      self.tweets['type']):
        if re.match("(\w+:\/\/\S+)", sentence) == None:
            sentence = ' '.join(
                re.sub(
                    "(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)",
                    " ", sentence).split())
            author = ' '.join(
                re.sub(
                    "(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)",
                    " ", author).split())
            sentence = re.sub('<.*?>', '', sentence)
            author = re.sub('<.*?>', '', author)
            sentence = ftfy.fix_text(sentence)
            author = ftfy.fix_text(author)
            sentence = expand_contractions(sentence)
            author = expand_contractions(author)
            sentence = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", sentence).split())
            author = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", author).split())
            stop_words = set(stopwords.words('english'))
            word_tokens = nltk.word_tokenize(sentence)
            filtered_sentence = [
                w for w in word_tokens
                if not w in stop_words and w in self.word2vec.vocab
            ]
            print(filtered_sentence)
            count.update(filtered_sentence)
            self.sentence_list.append(filtered_sentence)
            self.type_list.append(type)
            self.author_list.append(author)

    self.clean_tweets_dict = {
        j[0]: i for i, j in enumerate(count.most_common(12000))
    }
    self.clean_tweets_dict['UNK'] = 12001
    self.clean_tweets_dict['PAD'] = 12002
    pickle.dump(self.clean_tweets_dict, open('word_dictionary.pkl', 'wb'))
    self.spliting_data()
    self.build_word_embedding_matrix()
    self.build_model()
def clean_html_string(x):
    return ftfy.fix_encoding(
        ftfy.fix_text(x.replace("\n", "").replace("\t", "").strip(),
                      normalization='NFKC'))
def _fix_data(self, data, fix=True):
    if fix:
        return ftfy.fix_text(unicode(data, errors='ignore'))
    return data
# Open the file to output the data to
file_id = 0
file = open_file(file_id)

count = 0
for doc in cursor:
    # Create a dictionary of values based on the field names
    row_input = {
        'mid': str(doc['_id']),  # mongo id
        'tid': doc['id_str'],  # tweet id
        'text': ftfy.fix_text(doc['text']),
        'date': doc['created_at'],
        'ts': doc['timestamp_ms'],  # timestamp
        'rt_text': ftfy.fix_text(doc.get('retweeted_status', {'text': ''})['text']),
        'rt_user': doc.get('retweeted_status',
                           {'user': {'screen_name': ''}})['user']['screen_name'],
        'user': doc['user']['screen_name'],  # the user
        'hashtags': [t['text'] for t in doc['entities']['hashtags']],
        'urls': [t['expanded_url'] for t in doc['entities']['urls']],
        'mentions':
def encode(self, document):
    document = self.nlp(self.standardize_text(ftfy.fix_text(document)))
    document_tokens = []
    for token in document:
        document_tokens.extend(
            [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
    return document_tokens
def gen_tweets(tweets, retweets, notext, adddot, maxpages):
    r = session.get(url, headers=headers)
    pages = maxpages
    json = r.json()
    # if no number of tweets specified, all tweets from the json will be returned
    found = tweets or json['new_latent_count'] - 1

    while pages > 0 and found > 0:
        json = r.json()
        try:
            html = HTML(html=json['items_html'], url='bunk', default_encoding='utf-8')
        except KeyError:
            raise ValueError(
                f'Oops! Either "{user}" does not exist or is private.')

        comma = ","
        dot = "."
        tweets = []
        for tweet in html.find('.stream-item'):
            data = tweet.find('.tweet-text')
            if len(data) < 1:
                continue
            raw = tweet.find('.tweet-text')[0].raw_html
            text = tweet.find('.tweet-text')[0].full_text
            text = re.sub('\Shttp', ' http', text, 1)
            text = re.sub('.@', ' @', text)
            remove = 'pic.twitter.com'
            removelen = len(remove) + 11
            index = text.find(remove)
            while index > -1:
                text = text[0:index] + text[index + removelen:]
                index = text.find('pic.twitter.com')
            text = text.replace(u'\xa0', u' ')
            text = re.sub('[ \t\f\v]+', ' ', text)
            # fixes common encoding problems in the tweet text body
            text = fix_text(text.strip())

            tweetId = tweet.find('.js-permalink')[0].attrs['data-conversation-id']
            originaluserId = tweet.find('.js-original-tweet')[0].attrs['data-screen-name']
            time = datetime.fromtimestamp(
                int(tweet.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0)
            time = time.strftime("%Y-%m-%d %H:%M:%S")

            interactions = [
                x.text for x in tweet.find('.ProfileTweet-actionCount')
            ]
            replies = interactions[0].split(" ")[0].replace(comma, "").replace(dot, "") or "0"
            retweets = interactions[1].split(" ")[0].replace(comma, "").replace(dot, "") or "0"
            likes = interactions[2].split(" ")[0].replace(comma, "").replace(dot, "") or "0"

            hashtags = [
                hashtag_node.full_text
                for hashtag_node in tweet.find('.twitter-hashtag')
            ]
            urls = [
                url_node.attrs['data-expanded-url']
                for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')
            ]
            photos = [
                photo_node.attrs['data-image-url']
                for photo_node in tweet.find('.AdaptiveMedia-photoContainer')
            ]

            videos = []
            video_nodes = tweet.find(".PlayableMedia-player")
            for node in video_nodes:
                try:
                    styles = node.attrs['style'].split()
                    for style in styles:
                        if style.startswith('background'):
                            tmp = style.split('/')[-1]
                            video_id = tmp[:tmp.index('.jpg')]
                            videos.append({'id': video_id})
                except ValueError:
                    continue

            emoji = [
                emoji_node.attrs['title'] for emoji_node in tweet.find('.Emoji')
            ]

            correcttweet = retweets == True or originaluserId.lower() == user.lower()
            tweetsize = len(text)
            accepttweet = notext == True or tweetsize > 0

            if correcttweet and accepttweet:
                if adddot and tweetsize > 0:
                    if not (text[-1] == '!' or text[-1] == '?' or text[-1] == '.'):
                        text += '.'
                    text = text.replace(' .', '.')
                tweets.append({
                    'tweetId': tweetId,
                    'time': time,
                    'user': user,
                    'originaluser': originaluserId,
                    'text': text,
                    'replies': replies,
                    'retweets': retweets,
                    'likes': likes,
                    'entries': {
                        'hashtags': hashtags,
                        'emoji': emoji,
                        'urls': urls,
                        'photos': photos,
                        'videos': videos
                    }
                })

        for tweet in tweets:
            if tweet and found > 0:
                found += -1
                yield tweet

        if json['has_more_items'] == True:
            last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']
            r = session.get(url, params={'max_position': last_tweet}, headers=headers)
            pages += -1
        else:
            # reset the count regardless since there are no more tweets left
            found = 0
def clean_text(text):
    text = ftfy.fix_text(text, normalization="NFKC")
    return NOA.sub(" ", text)
heads = train['title']
heads
descs = train['content']
descs

heads = heads[:50]
descs = descs[:50]

title_list = []
for i in heads:
    title = ftfy.fix_text(i)
    title_list.append(title)
    # print(title)
    # print('---------------')
title_list

content_list = []
for i in descs:
    descs = ftfy.fix_text(i)
    content_list.append(descs)
    # print(descs)
    # print('---------------')
content_list
async def trivia(cmd: SigmaCommand, message: discord.Message, args: list):
    global streaks
    if await cmd.bot.cool_down.on_cooldown(cmd.name, message.author):
        timeout = await cmd.bot.cool_down.get_cooldown(cmd.name, message.author)
        on_cooldown = discord.Embed(
            color=0xccffff,
            title=f'❄ On cooldown for another {timeout} seconds.')
        await message.channel.send(embed=on_cooldown)
        return

    try:
        if message.author.id not in ongoing_list:
            ongoing_list.append(message.author.id)
            allotted_time = 20
            trivia_api_url = 'https://opentdb.com/api.php?amount=1'
            cat_chosen = False
            if args:
                catlook = args[-1].lower()
                for cat in categories:
                    cat_alts = categories.get(cat)
                    if catlook in cat_alts:
                        trivia_api_url += f'&category={cat}'
                        cat_chosen = True
                        break
                diflook = args[0].lower()
                if diflook in ['easy', 'medium', 'hard']:
                    trivia_api_url += f'&difficulty={diflook}'
                    cat_chosen = True
            async with aiohttp.ClientSession() as session:
                async with session.get(trivia_api_url) as number_get:
                    number_response = await number_get.read()
                    try:
                        data = json.loads(number_response).get('results')[0]
                    except json.JSONDecodeError:
                        if message.author.id in ongoing_list:
                            ongoing_list.remove(message.author.id)
                        decode_error = discord.Embed(
                            color=0xBE1931,
                            title='❗ Couldn\'t retrieve a question.')
                        await message.channel.send(embed=decode_error)
                        return
            await cmd.bot.cool_down.set_cooldown(cmd.name, message.author, 30)
            question = data['question']
            question = ftfy.fix_text(question)
            question = re.sub(r'([*_~`])', r'\\\1', question)  # escape markdown formatting
            category = data['category']
            correct_answer = data['correct_answer']
            correct_answer = ftfy.fix_text(correct_answer)
            incorrect_answers = data['incorrect_answers']
            difficulty = data['difficulty']
            reward_mult = streaks.get(message.author.id) or 0 if not cat_chosen else 0
            kud_reward = int(
                (awards.get(difficulty) or '10') * (1 + (reward_mult * 3.25) / (1 + (0.03 * reward_mult))))
            choice_list = [correct_answer] + incorrect_answers
            choice_list = shuffle_questions(choice_list)
            choice_number = 0
            choice_lines = []
            for choice in choice_list:
                choice_number += 1
                choice_line = f'[{choice_number}] {choice}'
                choice_lines.append(choice_line)
            choice_text = '\n'.join(choice_lines)
            choice_text = ftfy.fix_text(choice_text)
            if difficulty == 'easy':
                starter = 'An'
            else:
                starter = 'A'
            question_embed = discord.Embed(color=0xF9F9F9, title='❔ Here\'s a question!')
            question_embed.description = f'{starter} {difficulty} one from the {category} category.'
            question_embed.add_field(name='Question', value=question, inline=False)
            question_embed.add_field(name='Choices', value=f'```py\n{choice_text}\n```', inline=False)
            question_embed.set_footer(text='Input the number of your chosen answer.')
            question_embed.set_author(name=message.author.display_name,
                                      icon_url=user_avatar(message.author))
            await message.channel.send(embed=question_embed)

            def check_answer(msg):
                if message.channel.id != msg.channel.id:
                    return
                if message.author.id != msg.author.id:
                    return
                if msg.content.isdigit():
                    if abs(int(msg.content)) <= len(choice_lines):
                        return True
                    else:
                        return
                elif msg.content.title() in choice_list:
                    return True

            try:
                answer_message = await cmd.bot.wait_for('message', check=check_answer, timeout=allotted_time)
                try:
                    answer_index = int(answer_message.content) - 1
                except ValueError:
                    answer_index = None
                correct_index = get_correct_index(choice_list, correct_answer)
                if answer_index == correct_index or answer_message.content.lower() == correct_answer.lower():
                    if cat_chosen:
                        streaks.update({message.author.id: reward_mult + 0.005})
                    else:
                        streaks.update({message.author.id: reward_mult + 1})
                    await cmd.db.add_currency(answer_message.author, message.guild, kud_reward)
                    author = answer_message.author.display_name
                    currency = cmd.bot.cfg.pref.currency
                    win_title = f'🎉 Correct, {author}, it was {correct_answer}. You won {kud_reward} {currency}!'
                    final_embed = discord.Embed(color=0x77B255, title=win_title)
                else:
                    if message.author.id in streaks:
                        streaks.pop(message.author.id)
                    lose_title = f'💣 Ooh, sorry, it was {correct_answer}...'
                    final_embed = discord.Embed(color=0x262626, title=lose_title)
                await message.channel.send(embed=final_embed)
            except asyncio.TimeoutError:
                if message.author.id in streaks:
                    streaks.pop(message.author.id)
                timeout_title = f'🕙 Time\'s up! It was {correct_answer}...'
                timeout_embed = discord.Embed(color=0x696969, title=timeout_title)
                await message.channel.send(embed=timeout_embed)
            if message.author.id in ongoing_list:
                ongoing_list.remove(message.author.id)
        else:
            ongoing_error = discord.Embed(
                color=0xBE1931,
                title='❗ There is already one ongoing.')
            await message.channel.send(embed=ongoing_error)
    except Exception:
        if message.author.id in ongoing_list:
            ongoing_list.remove(message.author.id)
        raise
def read_sd(self, f, fref=None):
    """Read document contents from a ScienceDirect XML file."""

    def get_para_sents(p):
        if p.find('list'):
            # Really this needs to be split into the paragraph text
            # before and after the list, but BeautifulSoup is a pain, and
            # this is good enough.
            l = p.find('list').replace_with(' ... ')
            sents = [
                re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())
            ]
            for para in l.find_all(['para', 'simple_para']):
                sents.extend([
                    re.sub(r'\s+', ' ', x) for x in st.tokenize(para.get_text())
                ])
            return sents
        return [re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())]

    if '-ref.xml' in f:
        return

    xml = io.open(f, 'r', encoding='utf-8').read()
    xml = ftfy.fix_text(xml, uncurl_quotes=False, fix_entities=False)
    xml = strtr(xml, {'e´': 'é', 'e`': 'è'})
    xml = re.sub("([</])(dc|prism|ce|sb|xocs):", r"\1", xml)
    soup = BeautifulSoup(xml, 'lxml')

    try:
        pii = re.sub('[()-.]', '', soup.find('pii').string)
    except:
        print('No PII found for', f)
        return

    self.id = 'sd-' + pii.lower()
    self.authors = []
    try:
        for author in soup('creator'):
            x = author.string.strip()
            name = re.sub('^.*, ', '', x) + ' ' + re.sub(',.*$', '', x)
            self.authors.append(name)
    except:
        pass
    if not self.authors and soup.editor:
        self.authors = [
            x.get_text() + ' (ed.)' for x in soup.editor('authors')
        ]

    if soup.title:
        self.title = soup.title.string.strip()
    if soup.publicationname:
        self.book = soup.publicationname.string.strip()
    self.url = 'http://www.sciencedirect.com/science/article/pii/' + pii
    if soup.coverdate:
        # Dates are in format YYYY-MM-DD
        self.year = int(re.sub('-.*', '', soup.coverdate.string))

    st = SentTokenizer()
    if soup.abstract:
        sec = {
            'heading': 'Abstract',
            'text': st.tokenize(soup.find('abstract-sec').get_text())
        }
        self.sections.append(sec)

    sec_id = ''
    sec = {'text': []}
    sec_last = {'text': []}
    for p in soup.find_all(['para', 'simple-para']):
        if p.find_parents('outline'):
            continue
        elif p.find('list') and p.find('list').find('section-title'):
            continue
        elif p.find_parents('para'):
            continue
        elif p.find_parents('floats'):
            # Lest these show up at the start and be treated as an
            # abstract.
            sec_last['text'] += get_para_sents(p)
            continue
        if p.parent.name in ['section', 'biography']:
            p_sec_id = p.parent.get('id', '')
            if p_sec_id != sec_id:
                if sec['text']:
                    self.sections.append(sec)
                sec = {'text': []}
                sec_id = p_sec_id
                heading = p.parent.find('section-title')
                if heading and heading.string:
                    sec['heading'] = heading.string.strip()
                elif p.parent.name == 'biography':
                    sec['heading'] = 'Biography'
        sec['text'] += get_para_sents(p)
    if sec['text']:
        self.sections.append(sec)
    if sec_last['text']:
        self.sections.append(sec_last)

    if soup.rawtext and len(self.sections) < 3:
        self.sections.append(
            {'text': st.tokenize(soup.rawtext.get_text())})

    if len(self.text()) < 200:
        print(' ! Skip:', self.title, self.id + '. Missing text.')
        return

    if not fref:
        fref = f.replace('-full.xml', '-ref.xml')
    if os.path.exists(fref):
        reftext = io.open(fref, 'r', encoding='utf-8').read()
        self.references = set([
            x.replace('PII:', 'sd-').lower()
            for x in re.findall('PII:[^<]+', reftext)
        ])
import json
import argparse
import os
from ftfy import fix_text
from pprint import pprint

parser = argparse.ArgumentParser()
parser.add_argument('--source-file', required=True)
args = parser.parse_args()

with open(args.source_file) as data_file:
    game_data = json.load(data_file)

rounds = ('jeopardy', 'double-jeopardy')
for round in rounds:
    for category in game_data[round]:
        category['name'] = fix_text(category['name'].upper())
        for q in category['questions']:
            q['question'] = fix_text(q['question'].upper())
            q['answer'] = fix_text(q['answer'])

game_data['final-jeopardy']['category'] = fix_text(game_data['final-jeopardy']['category'].upper())
game_data['final-jeopardy']['question'] = fix_text(game_data['final-jeopardy']['question'].upper())

name, ext = os.path.splitext(args.source_file)
outfile_name = "{name}_{uid}{ext}".format(name=name, uid='formatted', ext=ext)

with open(outfile_name, 'w') as outfile:
    json.dump(game_data, outfile, indent=4)
def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
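# Minimal illustration (not part of the original project) of why basic_clean
# unescapes twice: web-scraped text is often double-escaped, so "&amp;amp;"
# only becomes a literal "&" after two rounds of html.unescape().
import html

assert html.unescape('&amp;amp;') == '&amp;'
assert html.unescape(html.unescape('&amp;amp;')) == '&'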
# show the output images
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
# cv2.waitKey(5000)

# writing extracted data into a text file
text_output = open('outputbase.txt', 'w', encoding='utf-8')
text_output.write(text)
text_output.close()

file = open('outputbase.txt', 'r', encoding='utf-8')
text = file.read()
# print(text)

# Cleaning all the gibberish text
text = ftfy.fix_text(text)
text = ftfy.fix_encoding(text)
print(text)

# Initializing data variable
name = None
fname = None
dob = None
pan = None
nameline = []
dobline = []
panline = []
text0 = []
text1 = []
text2 = []
    import ftfy
except ImportError:
    print(
        'To use fix_encodings the ftfy package needs to be installed. Please install it using pip'
    )
    exit()

tagger = HUNERTagger(names=[args.name])
split_sentences = not args.assume_sentence_splitted and not args.assume_tokenized

with open(args.input) as f_in, open(args.output, 'w') as f_out:
    buff = []
    c = 0
    for line in f_in:
        if args.fix_encoding:
            line = ftfy.fix_text(line)
            line = line.encode('ascii', 'ignore').decode()
        if args.assume_tokenized:
            line = [line.split()]
        elif args.assume_sentence_splitted:
            line = [line]
        else:
            line = [line]
        c += 1
        buff += line
        if c % args.batchsize == 0:
            tagged_line = tagger.tag(buff,
                                     split_sentences=split_sentences,
                                     tokenize=not args.assume_tokenized)[0]
            for sentence in tagged_line:
                for tok, tag in sentence:
text = re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE)

if len(text.strip()) == 0:
    continue

clean_html = ""
tree = ""
try:
    if options.cleanhtml:
        # HTML is then normalized
        logging.info(url + ": cleaning HTML")
        clean_html = cleaner.clean_html(text)
    else:
        clean_html = text
    if options.ftfy:
        tree = ftfy.fix_text(clean_html, fix_entities=False, fix_character_width=False)
    else:
        tree = clean_html
except Exception as ex:
    logging.info("Skipping " + url + ": " + str(ex))
    continue

clean_tree = tree.replace("&#160;", " ")
clean_tree = clean_tree.replace("\t", " ")
clean_tree = clean_tree.encode('utf-8')

if http_headers:
    http_headers.replace_header('Content-Length', str(len(clean_tree)))
    http_headers.replace_header('Content-Type', 'text/html')

new_record = fo.create_warc_record(uri=url,
                                   record_type=record_type,
                                   warc_content_type=record.content_type,
                                   payload=BytesIO(clean_tree),
                                   http_headers=http_headers)
fo.write_record(new_record)
def get_title(entry: Dict[str, Any]) -> Optional[str]:
    title = entry.get("title")
    if title:
        return ftfy.fix_text(title)
    else:
        return None
def clean_text(text, remove_newlines=False) -> str:
    cleaned = fix_text(BeautifulSoup(text, "lxml").text.strip())
    if remove_newlines:
        cleaned = re.sub(newline_regex, " ", cleaned)
    return cleaned
with open('pilgub.csv', 'a', newline='', encoding='UTF-8') as f:
    thewriter = csv.writer(f)
    # first write csv
    thewriter.writerow(["tweet", "clean_tweet", "sentiment"])

    for tweet in tweepy.Cursor(api.search,
                               q=query + "-filter:retweets",
                               lang="id",
                               show_user="******",
                               since='2018-06-04',
                               until='2018-07-09').items(max_tweets):
        # process single status
        if 'RT ' not in tweet.text:
            rt = rt + 1
            texts = fix_text(tweet.text,
                             remove_terminal_escapes=True,
                             remove_bom=True,
                             remove_control_chars=True)
            if '…' in texts:
                texts = texts.rsplit(' ', 1)[0]
            # remove symbol unicode
            texts = texts.encode('ascii', 'ignore').decode('utf-8')
            """
            if preprocess_tweet(texts) == ' ':
                continue
            if preprocess_tweet(texts) == '':
                continue
            """
            print("============================")
            print(tweet.retweeted)
            print("============================")
            print("Ver: Ori")
def test_lix(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.readability["lix"]
def open_file(self) -> Iterable:
    raw_file = open(self.file_path(), "rb").read().decode(errors="replace")
    fixed_file = ftfy.fix_text(raw_file)
    reader = csv.DictReader(fixed_file.split("\n"), delimiter="\t")
    return reader