def _id_field_iter(text):
    current_id = None
    for match in re.finditer(r'^(\w+)\s+=\s+\{\s*(.*?)\s*\}$', text, re.MULTILINE | re.DOTALL):
        key, value_bytes = match.groups()
        # &#X only happens once, but it's wrong and ftfy doesn't treat it the same way as &#x
        value_bytes = value_bytes.replace('&#X', '&#x')
        # same for ,, -- only happens once
        value_bytes = value_bytes.replace(',,', ',')
        # reverse line feed? no thanks
        value_bytes = value_bytes.replace('', '')
        # I don't even know...
        value_bytes = value_bytes.replace('&#scaron;', 'š')
        # you'd think that ;; sequences would make sense -- a non-final author
        # with a first name ending in an html entity, but this actually fixes
        # more things than it breaks (need to address the few cases where it does break)
        value_bytes = value_bytes.replace(';;', ';')
        # UTF-8 is a better first guess, but will break on some of the input
        try:
            value_unicode = value_bytes.decode('UTF-8')
        except UnicodeDecodeError:
            value_unicode = value_bytes.decode('ISO-8859-2')
        value = fix_text(value_unicode, fix_entities=True, normalization='NFKC')
        # the ftfy docs say it will repeat if needed, but it doesn't?
        value = fix_text(value, fix_entities=True, normalization='NFKC')
        if key == 'id':
            current_id = value
        yield current_id, (key, value)
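
A minimal, self-contained sketch of the decode-with-fallback and repeated fix_text idea used above (Python 3; the helper name and sample bytes are illustrative, not part of the original module):

from ftfy import fix_text

def decode_field(value_bytes):
    # UTF-8 is the better first guess; fall back to ISO-8859-2 otherwise,
    # mirroring the try/except above.
    try:
        value_unicode = value_bytes.decode('UTF-8')
    except UnicodeDecodeError:
        value_unicode = value_bytes.decode('ISO-8859-2')
    # Run fix_text twice, since a single pass did not always converge on this data.
    value = fix_text(value_unicode, fix_entities=True, normalization='NFKC')
    return fix_text(value, fix_entities=True, normalization='NFKC')

print(decode_field(b'Budv\xe1r'))  # not valid UTF-8; decodes via ISO-8859-2 to 'Budvár'
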
Example #2
def test_real_text():
    """
    Test with text actually found in the wild (mostly on Twitter).

    I collected test cases by listening to the Twitter streaming API for
    a million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8.

    TEST_CASES contains the most interesting examples of these, often with some
    trickiness of how to decode them into the actually intended text.

    For some reason, sampling Twitter gives no examples of text being
    accidentally decoded as Windows-1250, even though it's one of the more
    common encodings and this mojibake has been spotted in the wild. It may be
    that Windows-1250 is used in places that culturally don't use Twitter much
    (Central and Eastern Europe), and therefore nobody designs a Twitter app or
    bot to use Windows-1250. I've collected a couple of examples of
    Windows-1250 mojibake from elsewhere.
    """
    for orig, target in TEST_CASES:
        # make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # make sure we can decode as intended even with an extra layer of badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
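
The "extra layer of badness" in this test is the classic UTF-8-bytes-read-as-Latin-1 mojibake; a small stand-alone illustration (the sample string is made up):

from ftfy import fix_text

target = 'café'
extra_bad = target.encode('utf-8').decode('latin-1')  # 'cafÃ©'
assert fix_text(extra_bad) == target
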
Example #3
def test_real_tweets():
    """
    Test with text actually found on Twitter.

    I collected these test cases by listening to the Twitter streaming API for
    a million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8.

    The tweets that appear in TEST_CASES are the most interesting examples of
    these, with some trickiness of how to decode them into the actually intended
    text.
    """
    for orig, target in TEST_CASES:
        # make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # make sure we can decode as intended even with an extra layer of badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
Example #4
 def _process_tag(story):
     tags = story.find("Editor_Tags")
     return {
         "date": datetime.strptime(story.get("Date"), "%d %b %Y"),
         "story_id": int(story.get("StoryId")),
         "headline": fix_text(unicode(story.find("Headline").text)),
         "text": fix_text(unicode(story.find("Story_text").text)),
         "source": story.find("Source").text,
         "tags": {int(e.attrib.get("OrgID")): e.attrib.get("Relevance") for e in tags} if tags is not None else [],
     }
Example #5
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    eq_(fix_text(example), '&\n<html>\n&amp;')
    eq_(fix_text_segment(example), '&amp;\n<html>\n&amp;')

    eq_(fix_text(example, fix_entities=True), '&\n<html>\n&')
    eq_(fix_text_segment(example, fix_entities=True), '&\n<html>\n&')

    eq_(fix_text(example, fix_entities=False), '&amp;\n<html>\n&amp;')
    eq_(fix_text_segment(example, fix_entities=False), '&amp;\n<html>\n&amp;')

    eq_(fix_text_segment('&lt;&gt;', fix_entities=False), '&lt;&gt;')
    eq_(fix_text_segment('&lt;&gt;', fix_entities=True), '<>')
    eq_(fix_text_segment('&lt;&gt;'), '<>')
Example #6
    def check_ftfy(self, text, encoding_only=True):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.
        """
        self.count += 1
        text = unescape_html(text)
        if not possible_encoding(text, 'ascii'):
            if encoding_only:
                fixed = fix_encoding(text)
            else:
                fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
            if text != fixed:
                # possibly filter common bots before printing
                print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=fixed
                ))
                self.num_fixed += 1
            elif 'â€' in text or '\x80' in text:
                print('\nNot fixed:\t{text!r}'.format(text=text))

        # Print status updates once in a while
        if self.count % 100 == 0:
            print('.', end='', flush=True)
        if self.count % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
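
A rough stand-alone version of the same check, using only ftfy's public fix_encoding (would_ftfy_change is a hypothetical helper, not part of the class above):

import ftfy

def would_ftfy_change(text):
    # Pure-ASCII text cannot contain the mojibake that fix_encoding targets.
    if text.isascii():
        return False
    return ftfy.fix_encoding(text) != text

print(would_ftfy_change('already fine'))   # False
print(would_ftfy_change('This â€” that'))  # True: 'â€”' is a mangled em dash
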
Example #7
 def __init__(self, url):
     """ Init obj """
     super(WatchHerePage, self).__init__(url)
     if not self.is_watch_here_page():
         raise PageTypeError('Not a Watch Here page')
     self.desc = ftfy.fix_text(self.entry.p.text)
     self.watch_here_link = get_abs_ct_url(self.entry.a['href'])
Example #8
def standardize_as_list(text, token_filter=None):
    """
    Get a list of tokens or stems that appear in the text.

    `token_filter` is an optional function to apply to the list of tokens,
    performing language-specific lemmatization and stopword removal. In
    practice, the only such filter is for English.

    >>> standardize_as_list('the dog', token_filter=english_filter)
    ['dog']
    >>> standardize_as_list('big dogs', token_filter=english_filter)
    ['big', 'dog']
    >>> standardize_as_list('big dogs')
    ['big', 'dogs']
    >>> standardize_as_list('to go', token_filter=english_filter)
    ['go']
    >>> standardize_as_list('the', token_filter=english_filter)
    ['the']
    >>> standardize_as_list('to', token_filter=english_filter)
    ['to']
    """
    text = fix_text(text)
    tokens = [token for token in simple_tokenize(text)]
    if token_filter is not None:
        tokens = token_filter(tokens)
    return tokens
def tokenizer(text, tokenizer_fn, to_lower=False):
    text = ftfy.fix_text(text)
    if to_lower:
        text = text.lower()
    try:
        seq = Sequence(text.strip())
    except ValueError:
        return
    tokens = tokenizer_fn.transform(seq)
    new_tokens = []
    for token in tokens:
        if token.strip() == "":
            continue
        elif PUNCTSYM.search(token):
            token = "$"
        elif LIKENUM.search(token):
            token = "0"
        elif LIKEUNIT.search(token):
            token = LIKEUNIT.sub(r"0 \1", token)
        elif token == "can't":
            token = "can not"
        elif CONTRACTION1.search(token):
            token = CONTRACTION1.sub(r"\1 '\2", token)
        elif CONTRACTION2.search(token):
            token = CONTRACTION2.sub(r"\1 n't", token)
        new_tokens.append(token)
    if new_tokens:
        return " ".join(new_tokens).strip()
    return
Example #10
def read_clusters(clusters_loc):
    clusters = {}
    if ftfy is None:
        user_warning(Warnings.W004)
    with clusters_loc.open() as f:
        for line in tqdm(f):
            try:
                cluster, word, freq = line.split()
                if ftfy is not None:
                    word = ftfy.fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = "0"
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters
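
For reference, the loop above expects whitespace-separated lines of the form "cluster word frequency" (Brown-cluster style). A tiny usage sketch, assuming read_clusters and its spaCy helpers are importable (the file contents are made up):

from pathlib import Path

# clusters.txt (hypothetical):
#   0110 apple 42
#   0110 Apple 2
#   1011 run 17
clusters = read_clusters(Path('clusters.txt'))
# 'Apple' has frequency 2, below the cutoff of 3, so it maps to the
# unreliable marker "0"; the re-casing pass fills in APPLE, Run and RUN.
assert clusters['apple'] == '0110'
assert clusters['Apple'] == '0'
assert clusters['RUN'] == '1011'
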
Example #11
def get_article_sentences(article): 
    subs = {'Sen.': 'Senator', 'Lt. Gov.': 'Lieutenant Governor', 'Rep.': 'Representative', 'Reps.': 'Representatives', 'Gov.': 'Governor'}

    if 'body' in article:
        text = fix_text(article['body']).replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
        for a in subs:
            text = text.replace(a, subs[a])
        sentences = sentence_splitter.tokenize(text)
        return sentences
    elif 'text' in article:
        text = fix_text(article['text']).replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
        for a in subs:
            text = text.replace(a, subs[a])
        sentences = sentence_splitter.tokenize(text)
        return sentences
    return []
Example #12
def buildHTML(room, date, timezone):
    datestring = str(date.month) +"/" + str(date.day) + "/" + str(date.year)
    messages = msgByDate(room, date,timezone)
    roomtitle = room.title
    if len(messages) == 0:
        print "no messages to send, exiting"
        sys.exit(0)
    else:
        for message in reversed(messages):
            if u'text' in message.keys():
                message['localmsgtime'] = shiftToLocal(message[u'created'], timezone)
                message['timestamp'] = timeFixUp(message['localmsgtime'].hour) + ":" + timeFixUp(message['localmsgtime'].minute) + ":" + timeFixUp(message['localmsgtime'].second)
                message['displayname'] = getDisplayName(message['personId'], room.users)
                ## Need to work on this more. Need to account for ellipses before we ignore them. Need to figure out bullets.
                ## For now this will just ignore things that don't directly convert to ASCII.
                message[u'text'] = message[u'text'].encode("ascii","ignore")
                message[u'text'] = message[u'text'].decode("utf-8")
                message[u'text'] = fix_text(message[u'text'], normalization="NFKC")

    env = Environment(loader=PackageLoader('sparkdaily', 'templates'))
    template = env.get_template('newsletter.html')
    html = template.render(roomtitle=roomtitle, messages=reversed(messages), datestring=datestring)


    return html
Example #13
def transcribeStory(story:list):
    """
    Takes a list of dicts that contain the story text and metadata and
    transcribes it into the formatted book.
    """
    flushBook()
    #renderer = mistune.Renderer(escape=True, hard_wrap=True)
    #markdown = mistune.Markdown(renderer = renderer)
    for t in story:
        text_translate_table[(t['type'])](t)
    for c in story:
        try:
            addCitation([c['cite']])
        except KeyError as err:
            continue
    
    result = readBook()
    with open("output.markdown", mode='wt', encoding="utf-8") as file:
        file.write(result)
    
    renderer = mistune.Renderer(escape=True, hard_wrap=False)
    markdown = mistune.Markdown(renderer = renderer)
    htmltext = ftfy.fix_text(markdown(result))
    with open("output.html", mode='wt', encoding="utf-8") as file:
        file.write(htmltext)
    return result
Example #14
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by value
    in descending order.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
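
A short usage sketch of the expected input (counts.csv is a hypothetical file, sorted by value in descending order, one "word,count" row per line; simple_tokenize comes from the surrounding module):

# counts.csv:
#   the dog,120
#   a cat,75
#   rare word,0.5
values, total = read_values('counts.csv', cutoff=1.0)
# The scan stops at 'rare word' (below the cutoff); surviving counts are
# credited to every token they contain, so values['dog'] == 120.0.
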
Example #15
 def tidy(self):
     tidied = []
     for obj in Lexicon.objects.all():
         new = fix_text(obj.entry.strip(), fix_entities=True, normalization="NFKC", uncurl_quotes=True)
         if obj.entry != new:
             tidied.append((obj, new))
     return tidied
Example #16
def webfix_unicode(possible_string):
    """
    This is ugly but it will create Times-approved HTML
    out of terrible cut-and-paste from decision text.
    """
    CHAR_MAP = [
        (u'\xa7', u'&sect;'),
        (u'\u2014', u'&mdash;'),
        (u'\u2013', u'&ndash;'),
        (u'\x97', u'&mdash;'),
        (u'\xa4', u'&euro;'),
        (u'\u201c', u'"'),
        (u'\u201d', u'"'),
        (u'\x96', u'&#150;'),
    ]

    if isinstance(possible_string, basestring):
        string = possible_string
        string = string.strip()
        for char, replace_char in CHAR_MAP:
            string = string.replace(char, replace_char)
        string = string.decode('utf-8')
        string = unicode(string)
        string = ftfy.fix_text(string)
        string = smartypants.smartypants(string)
        return string

    return possible_string
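
The function above is Python 2 (basestring, unicode, str.decode). A rough Python 3 sketch of the same cleanup, assuming the smartypants package and CHAR_MAP hoisted to module scope:

import ftfy
import smartypants

def webfix_unicode_py3(possible_string):
    # In Python 3, str is already Unicode, so the decode/unicode steps disappear.
    if not isinstance(possible_string, str):
        return possible_string
    string = possible_string.strip()
    for char, replace_char in CHAR_MAP:
        string = string.replace(char, replace_char)
    string = ftfy.fix_text(string)
    return smartypants.smartypants(string)
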
Example #17
def preprocess_text(text):
    """
    Given any basestring as input, make its representation consistent:

    - Ensure that it is a Unicode string, converting from UTF-8 if
      necessary.
    - Detect whether the text was incorrectly encoded into UTF-8 and fix it,
      as defined in `fix_bad_unicode`.
    - Replace HTML entities with their equivalent characters.
    - Replace newlines and tabs with spaces.
    - Remove all other control characters.
    - Normalize it with Unicode normalization form KC, which applies the
      following relevant transformations:
      - Combine characters and diacritics that are written using separate
        code points, such as converting "e" plus an acute accent modifier
        into "é", or converting "ka" (か) plus a dakuten into the
        single character "ga" (が).
      - Replace characters that are functionally equivalent with the most
        common form: for example, half-width katakana will be replaced with
        full-width, and full-width Roman characters will be replaced with
        ASCII characters.
    """
    if isinstance(text, str):
        text = text.decode('utf-8')
    return fix_text(text)
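
The width folding and composition described in the docstring correspond to NFKC normalization, which many examples in this collection request explicitly; a small illustration (the sample strings are made up):

from ftfy import fix_text

print(fix_text('Ｈｅｌｌｏ', normalization='NFKC'))   # 'Hello' (full-width folded to ASCII)
print(fix_text('cafe\u0301', normalization='NFKC'))  # 'café' (combining accent composed)
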
def test_json_example(test_case):
    # Run one example from the data file
    orig = test_case['original']
    fixed = test_case['fixed']

    # Make sure that the fix_encoding step outputs a plan that we can
    # successfully run to reproduce its result
    encoding_fix, plan = fix_encoding_and_explain(orig)
    assert apply_plan(orig, plan) == encoding_fix

    # Make sure we can decode the text as intended
    assert fix_text(orig) == fixed
    assert encoding_fix == test_case.get('fixed-encoding', fixed)

    # Make sure we can decode as intended even with an extra layer of badness
    extra_bad = orig.encode('utf-8').decode('latin-1')
    assert fix_text(extra_bad) == fixed
 def encode(self, texts, verbose=True):
     texts_tokens = []
     if verbose:
         for text in tqdm(texts, ncols=80, leave=False):
             text = self.nlp(text_standardize(ftfy.fix_text(text)))
             text_tokens = []
             for token in text:
                 text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
             texts_tokens.append(text_tokens)
     else:
         for text in texts:
             text = self.nlp(text_standardize(ftfy.fix_text(text)))
             text_tokens = []
             for token in text:
                 text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
             texts_tokens.append(text_tokens)
     return texts_tokens
Example #20
def smart_unicode(text):
    if text:
        if not isinstance(text, unicode):
            text = unicode(text, "utf8")

        text = ftfy.fix_text(text)

    return text
    def __call__(self, si, context):
        if not si.body or not si.body.raw:
            return si

        read_from = getattr(si.body, self.config['read_from'])
        setattr(si.body, self.config['write_to'],
                ftfy.fix_text(read_from.decode('utf-8')).encode('utf-8'))
        return si
def standardize(text, lang='en', remove_accents=True):
    text = fix_text(text)
    if remove_accents and (lang=='es' or text.startswith('/c/es/')):
        text = normalize('NFD', text).encode('ascii', errors='ignore').decode()
    if text.startswith('/c/'):
        return replace_numbers(text)
    else:
        return standardized_concept_uri(text, lang)
Example #23
 def parse(self):
     """
     Removes all unicode characters, nonprintable characters,
     and unneeded special characters.
     This formats the text for audio reading.
     """
     try: # Attempt to scrub the unicode with a library
         text = ftfy.fix_text(self.text)
         self.text = unidecode.unidecode(text).replace('[?]', '')
     except Exception: # If that fails, kill it with fire.
         print("Nuking the text.")
         text = bytes(self.text, 'utf-8')
         text = text.decode('unicode_escape')
         text = text.encode('ascii', 'ignore')
         text = text.decode('utf-8')
         self.text = str(text)
     
     try: # Try to translate the story into the reader's language
         if self.language != language:
             self.translate(language)
     except:
         pass
     
     # Formats text to remove odd artifacts from the conversion
     self.changes.update({
         '\n': ' ',          '\r': ' ',
         '"': "'",           '.': '. ',
         '.   .   . ': '',   '. . .': '...',
         "\'": "'",           '\"': '',
         ':': ': ',          ':  ': ': ',
         '!': '! ',          '!  ': '! ',
         '?': '? ',          '?  ': '? ',
         ';': '; ',          ';  ': '; ',
         '0': '0 ',          '1': '1 ',
         '2': '2 ',          '3': '3 ',
         '4': '4 ',          '5': '5 ',
         '6': '6 ',          '7': '7 ',
         '8': '8 ',          '9': '9 '
               })
     if self.speech == 'local':
         # The Microsoft SAPI pronunciation is a bit off
         self.changes.update({
                    'Tali': 'Tahlie',     'tali': 'tahlie',
                    'Yalo': ' Yah-lo ',   'caf ': 'cafe ',
                    'Garrus': 'Gae-rrus', 'Klenon': 'Klenn une',
                    'Binary': 'Bi-nary',  'Noveria': ' No-veir-eaah ',
                    'Vakarian': 'Vah-kare-eean'
                   })
     else:
         # Google's TTS is better at its job :)
         self.changes.update({
                    'Tali': 'Tahhlee', 'tali': 'Tahhlee',
                    'caf ': 'cafe '
                   })
     # Apply the changes to the text.
     for original_word, changed_word in self.changes.items():
         self.text = self.text.replace(original_word, changed_word)
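
The primary path of parse() is the ftfy-then-unidecode scrub described in its docstring; a minimal stand-alone illustration (the sample string is made up):

import ftfy
import unidecode

raw = 'â€œsmartâ€\x9d quotes'
text = unidecode.unidecode(ftfy.fix_text(raw)).replace('[?]', '')
print(text)  # expected: '"smart" quotes'
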
Example #24
	def putline(self, line):
		line = line + '\r\n'
		if self.debugging > 1:
			print('*put*', self.sanitize(line))

		# FORCE the line to ALWAYS be utf-8.
		line = ftfy.fix_text(line)
		line = line.encode("UTF-8")
		self.sock.sendall(line)
Example #25
 def __init__(self, text, frenchspacing=False, normalise=False):
   self.data = text
   if normalise:
     self.data = ftfy.fix_text(self.data)
   if not frenchspacing:
       self._sentence_to_interstitial_spacing()
       self._interstitial_to_sentence_spacing()
   self._latex_symbols()
   self._hyphens_to_dashes()
Example #26
def clean_unicode(possible_string):
    if isinstance(possible_string, basestring):
        string = possible_string
        string = string.strip()
        string = string.decode('utf-8')
        string = unicode(string)
        string = ftfy.fix_text(string)
        return string
    return possible_string
def standardized_concept_uri(text, lang='en'):
    text = fix_text(text)
    tokens = simple_tokenize(text)
    if lang == 'en':
        tokens = english_filter(tokens)

    tokens = [replace_numbers(token) for token in tokens]
    slug = replace_numbers('_'.join(tokens))

    return '/'.join(['/c', LCODE_ALIASES.get(lang, lang), slug])
Example #28
 def txt2words(self, txt, remove_stopwords=True):
   txt = BeautifulSoup(txt).get_text()
   txt = ftfy.fix_text(txt)
   txt = txt.replace("\\n", '')
   txt = re.sub("[^0-9a-zA-Z]"," ", txt)
   if remove_stopwords:
     words = [self.save_stem(w) for w in txt.lower().split() if (w not in self.stopwords) & (len(w) > 2) & (not w.isdigit())]
   else:
     words = [self.save_stem(w) for w in txt.lower().split() if (len(w) > 2) & (not w.isdigit())]
   return words
def make_ascii_friendly(text):
    return ftfy.fix_text(text)

LISTING = '/Users/Kristen/PycharmProjects/proj/Other/data/current_dists.txt'
    def _deserialize(self, value, attr, data):
        """Deserialize sanitized string value."""
        value = super(SanitizedUnicode, self)._deserialize(value, attr, data)
        value = fix_text(value)

        # NOTE: This `join` might be inefficient... There's a solution with a
        # large compiled regex lying around, but needs a lot of tweaking.
        value = ''.join(filter(self.is_valid_xml_char, value))
        for char in self.UNWANTED_CHARACTERS:
            value = value.replace(char, '')
        return value
Example #31
def generate_corpus_for_quality_evaluation(k, pz_d, tweets,
                                           topic_words_distribution):
    all_tweets = []
    logger.info(k)
    df = pd.read_csv(tweets, encoding='utf-8')
    for index, row in df.iterrows():
        all_tweets.append(row['tweets'])

    tweets_pz_d = []
    with open(pz_d) as f:
        for l in f:
            line = l.strip().split(' ')
            tweets_pz_d.append([float(p) for p in line])

    results = {}
    for j in range(len(tweets_pz_d)):
        if 'nan' not in tweets_pz_d[j] and '-nan' not in tweets_pz_d[j]:
            sorted_pz_ds = list(tweets_pz_d[j])
            sorted_pz_ds.sort(reverse=True)
            topic_id = tweets_pz_d[j].index(sorted_pz_ds[0])
            if topic_id not in results:
                results[topic_id] = [all_tweets[j]]
            else:
                results[topic_id].append(all_tweets[j])

    final_result = []
    for tp in results:
        for keyword in topic_words_distribution[tp][1]:
            temp = []
            dedup = set()
            for tweet in results[tp]:
                if '%s' % keyword[0] in tweet.lower():
                    clean_text_list = (common.cleanhtml(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(
                                tweet.lower()))))).strip(' ').replace(
                                    '\n', ' ').split(' ')[:-1]
                    clean_text = ",".join(str(x) for x in clean_text_list)
                    if clean_text not in dedup:
                        temp.append(tweet)
                        dedup.add(clean_text)

            # samples_number = random.sample(range(1, len(temp)), 1)
            # if (tp == 6) and (keyword[0] == 'u.s.'):
            #     logger.info(temp)
            #     quit()

            samples_number = []
            if len(temp) <= 2:
                samples_number = range(len(temp))
            else:
                samples_number = random.sample(range(1, len(temp)), 2)
            for i in samples_number:
                result = {}
                result['topic_id'] = tp
                result['keyword'] = keyword[0]
                result['propability'] = keyword[1]
                result['tweet'] = temp[i]
                final_result.append(result)

    to_csv(
        final_result,
        '../../papers/2017_BMC_HPV/analysis/BTM/quality_evaluation/' + str(k) +
        'tp.csv')
Example #32
 def _text_from_page(self, page: fitz.Page) -> str:
     bloks = page.get_text_blocks()
     text = [blk[4].replace("\n", " ") for blk in bloks if blk[-1] == 0]
     text = "\r\n".join(text)
     return ftfy.fix_text(text, normalization="NFKC")
Example #33
def remove_specific_stop(words):
    punct = ['%', ',', '/', '(', ')', '.'] # frequent punctuation terms inside strings or digits
    for p in punct:
        words = words.replace(p, ' ')
        words = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", words)
    return words


texts, article = [], []
texts_txt = ''
for year in years_available:
    file_list = glob.glob("sources_data/{}/*.txt".format(year))
    for f in file_list:
        words = open(f).read()
        words = fix_text(words) # Fix any unicode problem
        words = words.replace('\n', ' ').replace('\r', '') # remove line breaks
        words = remove_specific_stop(words)
        words = gb_to_us(words)
        if(len(words.split()) >= 30): # Only abstracts with at least 30 words
            nlp_words = nlp(words)
            for word in nlp_words:
                if not is_noise(word):
                    article.append(word.lemma_)
            texts.append(article)
            texts_txt = texts_txt + ' '.join(article) + '\n'
            article = []

    with open("{}{}.pickle".format(DATA_CLEAN, year), "wb") as fp:
        pickle.dump(texts, fp)
Example #34
except Exception as e:
    syslog.syslog(str(e))
stdin_used = True
#except Exception as e:
#    if debug:
#        syslog.syslog("FATAL ERROR: Not all required input received")
#        print(str(e))
#        syslog.syslog(str(e))
#    sys.exit(1)

#if debug:
#    syslog.syslog("Encoding of subject: {0}".format(ftfy.guess_bytes(email_subject)[1]))
#    syslog.syslog("Encoding of body: {0}".format(ftfy.guess_bytes(email_data)[1]))

try:
    email_data = ftfy.fix_text(email_data.decode("utf-8", "ignore"))
except:
    email_data = ftfy.fix_text(email_data)
try:
    email_subject = ftfy.fix_text(email_subject.decode("utf-8", "ignore"))
except:
    email_subject = ftfy.fix_text(email_subject)

if debug:
    syslog.syslog(email_subject)
    syslog.syslog(email_data)

misp_url = config.misp_url
misp_key = config.misp_key
misp_verifycert = config.misp_verifycert
import pandas as pd
import numpy as np

data_path = '/Users/fredde/Database/'

df_load = pd.read_hdf(data_path + 'all_data_1year_comp.h5', 'table')

df_load.head()
import ftfy
a = list(df_load)
for i in range(len(a)):
    a[i] = ftfy.fix_text(a[i])

print(a)

df_load[list(df_load)[1]].plot()

headers = open(data_path + 'headers.csv', 'w')
for item in a:
    headers.write('\n' + str(item.encode('utf-8')))
Example #36
def fix_text_encoding(sentence: str):
    return fix_text(sentence)
Example #37
for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url

    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

        tree=""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
            tree = etree.tostring(document)
        except:
            continue

        tree = etree.tostring(document)
        cleantree = tree.decode("utf8").replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        lang = guess_lang_from_data2(cleantree)
        if len(languages) > 0 and lang not in languages:
            logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.")
        else:
            # If enabled, remove boilerplate HTML
            if options.boilerpipe:
Example #38
 def clean_wp_token(self, text):
     text = text.replace("\u0120", "", 1)
     text = text.replace("\u010a", "", 1)
     text = ftfy.fix_text(text)
     text = clean_extended_unicode(text)
     return text.strip()
Example #39
 def clean_wp_token(self, text):
     text = ftfy.fix_text(text)
     text = clean_extended_unicode(text)
     text = self._replace_re.sub("", text)
     return text.replace("</w>", "").strip()
 def _standardize(text):
     return text_standardize(ftfy.fix_text(text))
Example #41
    def pre_processing_dataset(self):
        self.tweets = pd.read_csv("mydatasetnew.csv",
                                  usecols=['author', 'sentence', 'type'])
        df = pd.read_csv("contractions.csv", usecols=['col1', 'col2'])
        contractions_dict = dict(zip(list(df.col1), list(df.col2)))
        self.sentence_list, self.type_list, self.author_list = [], [], []
        c_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

        def expand_contractions(text, c_re=c_re):
            def replace(match):
                return contractions_dict[match.group(0)]

            return c_re.sub(replace, text)

        self.word2vec = KeyedVectors.load_word2vec_format(
            "word2vec_twitter_tokens.bin",
            unicode_errors='ignore',
            binary=True)

        count = Counter()
        for author, sentence, type in zip(self.tweets['author'],
                                          self.tweets['sentence'],
                                          self.tweets['type']):
            if re.match("(\w+:\/\/\S+)", sentence) == None:
                sentence = ' '.join(
                    re.sub(
                        "(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)",
                        " ", sentence).split())
                author = ' '.join(
                    re.sub(
                        "(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)",
                        " ", author).split())

                sentence = re.sub('<.*?>', '', sentence)
                author = re.sub('<.*?>', '', author)
                sentence = ftfy.fix_text(sentence)
                author = ftfy.fix_text(author)
                sentence = expand_contractions(sentence)
                author = expand_contractions(author)
                sentence = ' '.join(
                    re.sub("([^0-9A-Za-z \t])", " ", sentence).split())
                author = ' '.join(
                    re.sub("([^0-9A-Za-z \t])", " ", author).split())
                stop_words = set(stopwords.words('english'))
                word_tokens = nltk.word_tokenize(sentence)
                filtered_sentence = [
                    w for w in word_tokens
                    if not w in stop_words and w in self.word2vec.vocab
                ]
                print(filtered_sentence)
                count.update(filtered_sentence)
                self.sentence_list.append(filtered_sentence)
                self.type_list.append(type)
                self.author_list.append(author)
        self.clean_tweets_dict = {
            j[0]: i
            for i, j in enumerate(count.most_common(12000))
        }
        self.clean_tweets_dict['UNK'] = 12001
        self.clean_tweets_dict['PAD'] = 12002
        pickle.dump(self.clean_tweets_dict, open('word_dictionary.pkl', 'wb'))
        self.spliting_data()
        self.build_word_embedding_matrix()
        self.build_model()
Example #42
def clean_html_string(x):
    return ftfy.fix_encoding(
        ftfy.fix_text(x.replace("\n", "").replace("\t", "").strip(),
                      normalization='NFKC'))
Example #43
 def _fix_data(self, data, fix=True):
     if fix:
         return ftfy.fix_text(unicode(data, errors='ignore'))
     return data
Example #44

# Open the file to output the data to
file_id = 0
file = open_file(file_id)

count = 0
for doc in cursor:
    # Create a dictionary of values based on the field names
    row_input = {
        'mid':
        str(doc['_id']),  # mongo id
        'tid':
        doc['id_str'],  # tweet id
        'text':
        ftfy.fix_text(doc['text']),
        'date':
        doc['created_at'],
        'ts':
        doc['timestamp_ms'],  # timestamp
        'rt_text':
        ftfy.fix_text(doc.get('retweeted_status', {'text': ''})['text']),
        'rt_user':
        doc.get('retweeted_status', {'user': {
            'screen_name': ''
        }})['user']['screen_name'],
        'user':
        doc['user']['screen_name'],  # the user
        'hashtags': [t['text'] for t in doc['entities']['hashtags']],
        'urls': [t['expanded_url'] for t in doc['entities']['urls']],
        'mentions':
Example #45
	def encode(self, document):
		document = self.nlp(self.standardize_text(ftfy.fix_text(document)))
		document_tokens = []
		for token in document:
			document_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
		return document_tokens
    def gen_tweets(tweets, retweets, notext, adddot, maxpages):
        r = session.get(url, headers=headers)
        pages = maxpages
        json = r.json()
        # if no number of tweets specified, all tweets from the json will be returned
        found = tweets or json['new_latent_count'] - 1

        while pages > 0 and found > 0:
            json = r.json()
            try:
                html = HTML(html=json['items_html'],
                            url='bunk',
                            default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{user}" does not exist or is private.')

            comma = ","
            dot = "."
            tweets = []
            for tweet in html.find('.stream-item'):
                data = tweet.find('.tweet-text')
                if len(data) < 1:
                    continue
                raw = tweet.find('.tweet-text')[0].raw_html
                text = tweet.find('.tweet-text')[0].full_text
                text = re.sub('\Shttp', ' http', text, 1)
                text = re.sub('.@', ' @', text)
                remove = 'pic.twitter.com'
                removelen = len(remove) + 11
                index = text.find(remove)
                while index > -1:
                    text = text[0:index] + text[index + removelen:]
                    index = text.find('pic.twitter.com')
                text = text.replace(u'\xa0', u' ')
                text = re.sub('[ \t\f\v]+', ' ', text)
                # fixes common encoding problems in the tweet text body
                text = fix_text(text.strip())
                tweetId = tweet.find(
                    '.js-permalink')[0].attrs['data-conversation-id']
                originaluserId = tweet.find(
                    '.js-original-tweet')[0].attrs['data-screen-name']
                time = datetime.fromtimestamp(
                    int(tweet.find('._timestamp')[0].attrs['data-time-ms']) /
                    1000.0)
                time = time.strftime("%Y-%m-%d %H:%M:%S")
                interactions = [
                    x.text for x in tweet.find('.ProfileTweet-actionCount')
                ]
                replies = interactions[0].split(" ")[0].replace(
                    comma, "").replace(dot, "") or "0"
                retweets = interactions[1].split(" ")[0].replace(
                    comma, "").replace(dot, "") or "0"
                likes = interactions[2].split(" ")[0].replace(
                    comma, "").replace(dot, "") or "0"
                hashtags = [
                    hashtag_node.full_text
                    for hashtag_node in tweet.find('.twitter-hashtag')
                ]
                urls = [
                    url_node.attrs['data-expanded-url'] for url_node in
                    tweet.find('a.twitter-timeline-link:not(.u-hidden)')
                ]
                photos = [
                    photo_node.attrs['data-image-url'] for photo_node in
                    tweet.find('.AdaptiveMedia-photoContainer')
                ]
                videos = []
                video_nodes = tweet.find(".PlayableMedia-player")
                for node in video_nodes:
                    try:
                        styles = node.attrs['style'].split()
                        for style in styles:
                            if style.startswith('background'):
                                tmp = style.split('/')[-1]
                                video_id = tmp[:tmp.index('.jpg')]
                                videos.append({'id': video_id})
                    except ValueError:
                        continue

                emoji = [
                    emoji_node.attrs['title']
                    for emoji_node in tweet.find('.Emoji')
                ]
                correcttweet = retweets == True or originaluserId.lower(
                ) == user.lower()
                tweetsize = len(text)
                accepttweet = notext == True or tweetsize > 0
                if correcttweet and accepttweet:
                    if adddot and tweetsize > 0:
                        if not (text[-1] == '!' or text[-1] == '?'
                                or text[-1] == '.'):
                            text += '.'
                    text = text.replace(' .', '.')
                    tweets.append({
                        'tweetId': tweetId,
                        'time': time,
                        'user': user,
                        'originaluser': originaluserId,
                        'text': text,
                        'replies': replies,
                        'retweets': retweets,
                        'likes': likes,
                        'entries': {
                            'hashtags': hashtags,
                            'emoji': emoji,
                            'urls': urls,
                            'photos': photos,
                            'videos': videos
                        }
                    })

            for tweet in tweets:
                if tweet and found > 0:
                    found += -1
                    yield tweet

            if json['has_more_items'] == True:
                last_tweet = html.find(
                    '.stream-item')[-1].attrs['data-item-id']
                r = session.get(url,
                                params={'max_position': last_tweet},
                                headers=headers)
                pages += -1
            else:
                # reset the count regardless since there are no more tweets left
                found = 0
Example #47
def clean_text(text):
    text = ftfy.fix_text(text, normalization="NFKC")
    return NOA.sub(" ", text)

heads = train['title']
heads

descs = train['content']
descs


heads = heads[:50]
descs = descs[:50]


title_list = []
for i in heads:
    title = ftfy.fix_text(i)
    title_list.append(title)
    #print(title)
    #print('---------------')    
title_list


content_list = []
for i in descs:
    descs = ftfy.fix_text(i)
    content_list.append(descs)
    #print(descs)
    #print('---------------')    


content_list
Example #49
async def trivia(cmd: SigmaCommand, message: discord.Message, args: list):
    global streaks
    if await cmd.bot.cool_down.on_cooldown(cmd.name, message.author):
        timeout = await cmd.bot.cool_down.get_cooldown(cmd.name,
                                                       message.author)
        on_cooldown = discord.Embed(
            color=0xccffff,
            title=f'❄ On cooldown for another {timeout} seconds.')
        await message.channel.send(embed=on_cooldown)
        return

    try:
        if message.author.id not in ongoing_list:
            ongoing_list.append(message.author.id)
            allotted_time = 20
            trivia_api_url = 'https://opentdb.com/api.php?amount=1'
            cat_chosen = False
            if args:
                catlook = args[-1].lower()
                for cat in categories:
                    cat_alts = categories.get(cat)
                    if catlook in cat_alts:
                        trivia_api_url += f'&category={cat}'
                        cat_chosen = True
                        break
                diflook = args[0].lower()
                if diflook in ['easy', 'medium', 'hard']:
                    trivia_api_url += f'&difficulty={diflook}'
                    cat_chosen = True
            async with aiohttp.ClientSession() as session:
                async with session.get(trivia_api_url) as number_get:
                    number_response = await number_get.read()
                    try:
                        data = json.loads(number_response).get('results')[0]
                    except json.JSONDecodeError:
                        if message.author.id in ongoing_list:
                            ongoing_list.remove(message.author.id)
                        decode_error = discord.Embed(
                            color=0xBE1931,
                            title='❗ Couldn\'t retrieve a question.')
                        await message.channel.send(embed=decode_error)
                        return
            await cmd.bot.cool_down.set_cooldown(cmd.name, message.author, 30)
            question = data['question']
            question = ftfy.fix_text(question)
            question = re.sub(r'([*_~`])', r'\\\1',
                              question)  # escape markdown formatting
            category = data['category']
            correct_answer = data['correct_answer']
            correct_answer = ftfy.fix_text(correct_answer)
            incorrect_answers = data['incorrect_answers']
            difficulty = data['difficulty']
            reward_mult = streaks.get(
                message.author.id) or 0 if not cat_chosen else 0
            kud_reward = int(
                (awards.get(difficulty) or '10') *
                (1 + (reward_mult * 3.25) / (1 + (0.03 * reward_mult))))
            choice_list = [correct_answer] + incorrect_answers
            choice_list = shuffle_questions(choice_list)
            choice_number = 0
            choice_lines = []
            for choice in choice_list:
                choice_number += 1
                choice_line = f'[{choice_number}] {choice}'
                choice_lines.append(choice_line)
            choice_text = '\n'.join(choice_lines)
            choice_text = ftfy.fix_text(choice_text)
            if difficulty == 'easy':
                starter = 'An'
            else:
                starter = 'A'
            question_embed = discord.Embed(color=0xF9F9F9,
                                           title='❔ Here\'s a question!')
            question_embed.description = f'{starter} {difficulty} one from the {category} category.'
            question_embed.add_field(name='Question',
                                     value=question,
                                     inline=False)
            question_embed.add_field(name='Choices',
                                     value=f'```py\n{choice_text}\n```',
                                     inline=False)
            question_embed.set_footer(
                text='Input the number of your chosen answer.')
            question_embed.set_author(name=message.author.display_name,
                                      icon_url=user_avatar(message.author))
            await message.channel.send(embed=question_embed)

            def check_answer(msg):
                if message.channel.id != msg.channel.id:
                    return
                if message.author.id != msg.author.id:
                    return
                if msg.content.isdigit():
                    if abs(int(msg.content)) <= len(choice_lines):
                        return True
                    else:
                        return
                elif msg.content.title() in choice_list:
                    return True

            try:
                answer_message = await cmd.bot.wait_for('message',
                                                        check=check_answer,
                                                        timeout=allotted_time)
                try:
                    answer_index = int(answer_message.content) - 1
                except ValueError:
                    answer_index = None
                correct_index = get_correct_index(choice_list, correct_answer)
                if answer_index == correct_index or answer_message.content.lower(
                ) == correct_answer.lower():
                    if cat_chosen:
                        streaks.update(
                            {message.author.id: reward_mult + 0.005})
                    else:
                        streaks.update({message.author.id: reward_mult + 1})
                    await cmd.db.add_currency(answer_message.author,
                                              message.guild, kud_reward)
                    author = answer_message.author.display_name
                    currency = cmd.bot.cfg.pref.currency
                    win_title = f'🎉 Correct, {author}, it was {correct_answer}. You won {kud_reward} {currency}!'
                    final_embed = discord.Embed(color=0x77B255,
                                                title=win_title)
                else:
                    if message.author.id in streaks:
                        streaks.pop(message.author.id)
                    lose_title = f'💣 Ooh, sorry, it was {correct_answer}...'
                    final_embed = discord.Embed(color=0x262626,
                                                title=lose_title)
                await message.channel.send(embed=final_embed)
            except asyncio.TimeoutError:
                if message.author.id in streaks:
                    streaks.pop(message.author.id)
                timeout_title = f'🕙 Time\'s up! It was {correct_answer}...'
                timeout_embed = discord.Embed(color=0x696969,
                                              title=timeout_title)
                await message.channel.send(embed=timeout_embed)
            if message.author.id in ongoing_list:
                ongoing_list.remove(message.author.id)
        else:
            ongoing_error = discord.Embed(
                color=0xBE1931, title='❗ There is already one ongoing.')
            await message.channel.send(embed=ongoing_error)
    except Exception:
        if message.author.id in ongoing_list:
            ongoing_list.remove(message.author.id)
        raise
Example #50
    def read_sd(self, f, fref=None):
        """Read document contents from a ScienceDirect XML file."""
        def get_para_sents(p):
            if p.find('list'):
                # Really this needs to be split into the paragraph text
                # before and after the list, but BeautifulSoup is a pain, and
                # this is good enough.
                l = p.find('list').replace_with(' ... ')
                sents = [
                    re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())
                ]
                for para in l.find_all(['para', 'simple_para']):
                    sents.extend([
                        re.sub(r'\s+', ' ', x)
                        for x in st.tokenize(para.get_text())
                    ])
                return sents
            return [re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())]

        if '-ref.xml' in f:
            return

        xml = io.open(f, 'r', encoding='utf-8').read()
        xml = ftfy.fix_text(xml, uncurl_quotes=False, fix_entities=False)
        xml = strtr(xml, {'e´': 'é', 'e`': 'è'})
        xml = re.sub("([</])(dc|prism|ce|sb|xocs):", r"\1", xml)
        soup = BeautifulSoup(xml, 'lxml')

        try:
            pii = re.sub('[()-.]', '', soup.find('pii').string)
        except:
            print('No PII found for', f)
            return

        self.id = 'sd-' + pii.lower()
        self.authors = []
        try:
            for author in soup('creator'):
                x = author.string.strip()
                name = re.sub('^.*, ', '', x) + ' ' + re.sub(',.*$', '', x)
                self.authors.append(name)
        except:
            pass

        if not self.authors and soup.editor:
            self.authors = [
                x.get_text() + ' (ed.)' for x in soup.editor('authors')
            ]

        if soup.title:
            self.title = soup.title.string.strip()
        if soup.publicationname:
            self.book = soup.publicationname.string.strip()
        self.url = 'http://www.sciencedirect.com/science/article/pii/' + pii
        if soup.coverdate:
            # Dates are in format YYYY-MM-DD
            self.year = int(re.sub('-.*', '', soup.coverdate.string))

        st = SentTokenizer()
        if soup.abstract:
            sec = {
                'heading': 'Abstract',
                'text': st.tokenize(soup.find('abstract-sec').get_text())
            }
            self.sections.append(sec)

        sec_id = ''
        sec = {'text': []}
        sec_last = {'text': []}
        for p in soup.find_all(['para', 'simple-para']):
            if p.find_parents('outline'):
                continue
            elif p.find('list') and p.find('list').find('section-title'):
                continue
            elif p.find_parents('para'):
                continue
            elif p.find_parents('floats'):
                # Lest these show up at the start and be treated as an
                # abstract.
                sec_last['text'] += get_para_sents(p)
                continue
            if p.parent.name in ['section', 'biography']:
                p_sec_id = p.parent.get('id', '')
                if p_sec_id != sec_id:
                    if sec['text']:
                        self.sections.append(sec)
                    sec = {'text': []}
                    sec_id = p_sec_id
                    heading = p.parent.find('section-title')
                    if heading and heading.string:
                        sec['heading'] = heading.string.strip()
                    elif p.parent.name == 'biography':
                        sec['heading'] = 'Biography'
            sec['text'] += get_para_sents(p)
        if sec['text']:
            self.sections.append(sec)
        if sec_last['text']:
            self.sections.append(sec_last)

        if soup.rawtext and len(self.sections) < 3:
            self.sections.append(
                {'text': st.tokenize(soup.rawtext.get_text())})

        if len(self.text()) < 200:
            print(' ! Skip:', self.title, self.id + '. Missing text.')
            return

        if not fref:
            fref = f.replace('-full.xml', '-ref.xml')

        if os.path.exists(fref):
            reftext = io.open(fref, 'r', encoding='utf-8').read()
            self.references = set([
                x.replace('PII:', 'sd-').lower()
                for x in re.findall('PII:[^<]+', reftext)
            ])
Example #51
import os
import json
import argparse
from ftfy import fix_text
from pprint import pprint

parser = argparse.ArgumentParser()
parser.add_argument('--source-file', required=True)
args = parser.parse_args()

with open(args.source_file) as data_file:
    game_data = json.load(data_file)

rounds = ('jeopardy', 'double-jeopardy')

for round in rounds:
    for category in game_data[round]:
        category['name'] = fix_text(category['name'].upper())
        for q in category['questions']:
            q['question'] = fix_text(q['question'].upper())
            q['answer'] = fix_text(q['answer'])

game_data['final-jeopardy']['category'] = fix_text(game_data['final-jeopardy']['category'].upper())
game_data['final-jeopardy']['question'] = fix_text(game_data['final-jeopardy']['question'].upper())


name, ext = os.path.splitext(args.source_file)
outfile_name = "{name}_{uid}{ext}".format(name=name, uid='formatted', ext=ext)

with open(outfile_name, 'w') as outfile:
    json.dump(game_data, outfile, indent=4)
Example #52
def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
# show the output images
#cv2.imshow("Image", image)
#cv2.imshow("Output", gray)
#cv2.waitKey(5000)

# writing extracted data into a text file
text_output = open('outputbase.txt', 'w', encoding='utf-8')
text_output.write(text)
text_output.close()

file = open('outputbase.txt', 'r', encoding='utf-8')
text = file.read()
# print(text)

# Cleaning all the gibberish text
text = ftfy.fix_text(text)
text = ftfy.fix_encoding(text)
print(text)

# Initializing data variable
name = None
fname = None
dob = None
pan = None
nameline = []
dobline = []
panline = []
text0 = []
text1 = []
text2 = []
Example #54
            import ftfy
        except ImportError:
            print(
                'To use fix_encodings the ftfy package needs to be installed. Please install it using pip'
            )
            exit()

    tagger = HUNERTagger(names=[args.name])
    split_sentences = not args.assume_sentence_splitted and not args.assume_tokenized

    with open(args.input) as f_in, open(args.output, 'w') as f_out:
        buff = []
        c = 0
        for line in f_in:
            if args.fix_encoding:
                line = ftfy.fix_text(line)
                line = line.encode('ascii', 'ignore').decode()
            if args.assume_tokenized:
                line = [line.split()]
            elif args.assume_sentence_splitted:
                line = [line]
            else:
                line = [line]
            c += 1
            buff += line
            if c % args.batchsize == 0:
                tagged_line = tagger.tag(buff,
                                         split_sentences=split_sentences,
                                         tokenize=not args.assume_tokenized)[0]
                for sentence in tagged_line:
                    for tok, tag in sentence:
        text = re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE)
        if len(text.strip()) == 0:
            continue

        clean_html = ""
        tree = ""
        try:
            if options.cleanhtml:
                # HTML is then normalized
                logging.info(url + ": cleaning HTML")
                clean_html = cleaner.clean_html(text)
            else:
                clean_html = text

            if options.ftfy:
                tree = ftfy.fix_text(clean_html, fix_entities=False, fix_character_width=False)
            else:
                tree = clean_html

        except Exception as ex:
            logging.info("Skipping " + url + ": " + str(ex))
            continue
        clean_tree = tree.replace("&#160;", " ")
        clean_tree = clean_tree.replace("\t", " ")
        clean_tree = clean_tree.encode('utf-8')
        if http_headers:
            http_headers.replace_header('Content-Length', str(len(clean_tree)))
            http_headers.replace_header('Content-Type', 'text/html')
        new_record = fo.create_warc_record(uri=url, record_type=record_type, warc_content_type=record.content_type, payload=BytesIO(clean_tree), http_headers=http_headers)
        fo.write_record(new_record)
Example #56
def get_title(entry: Dict[str, Any]) -> Optional[str]:
    title = entry.get("title")
    if title:
        return ftfy.fix_text(title)
    else:
        return None
Example #57
def clean_text(text, remove_newlines=False) -> str:
    cleaned = fix_text(BeautifulSoup(text, "lxml").text.strip())
    if remove_newlines:
        cleaned = re.sub(newline_regex, " ", cleaned)
    return cleaned
Example #58
with open('pilgub.csv', 'a', newline='', encoding='UTF-8') as f:
    thewriter = csv.writer(f)
    #    first write csv
    thewriter.writerow(["tweet", "clean_tweet", "sentiment"])
    for tweet in tweepy.Cursor(api.search,
                               q=query + "-filter:retweets",
                               lang="id",
                               show_user="******",
                               since='2018-06-04',
                               until='2018-07-09').items(max_tweets):
        #process single status
        if 'RT ' not in tweet.text:
            rt = rt + 1

        texts = fix_text(tweet.text,
                         remove_terminal_escapes=True,
                         remove_bom=True,
                         remove_control_chars=True)
        if '…' in texts:
            texts = texts.rsplit(' ', 1)[0]
        #remove symbol unicode
        texts = texts.encode('ascii', 'ignore').decode('utf-8')
        """
        if preprocess_tweet(texts) == ' ':
            continue
        if preprocess_tweet(texts) == '':
            continue      
        """
        print("============================")
        print(tweet.retweeted)
        print("============================")
        print("Ver: Ori")
Example #59
def test_lix(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.readability["lix"]
Example #60
 def open_file(self) -> Iterable:
     raw_file = open(self.file_path(), "rb").read().decode(errors="replace")
     fixed_file = ftfy.fix_text(raw_file)
     reader = csv.DictReader(fixed_file.split("\n"), delimiter="\t")
     return reader