Example #1
def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text).strip()
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode("utf-8")
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == "en":
        normalized = english.normalize(text)
    elif lang == "ja" and disambiguation is not None:
        match = re.search(r"\((.*?)\)", disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split("/", 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = "n"
            disambiguation = pos + "/" + re.sub(r"\s*\((.*?)\)\s*", "", rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.strip().replace(" ", "_").lower()
    if disambiguation:
        return "/c/%s/%s/%s" % (lang, normalized.replace(" ", "_"), disambiguation)
    else:
        return "/c/%s/%s" % (lang, normalized.replace(" ", "_"))
Example #2
def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text).strip()
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode('utf-8')
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == 'en':
        normalized = normalize(text)
    elif lang == 'ja' and disambiguation is not None:
        match = re.search(r'\((.*?)\)', disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split('/', 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = 'n'
            disambiguation = pos + '/' + re.sub(r'\s*\((.*?)\)\s*', '', rest)
        normalized = text.lower()
    else:
        normalized = text.lower()

    if disambiguation is not None:
        disambiguation = disambiguation.strip().replace(' ', '_').lower()
    if disambiguation:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambiguation)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))
Example #3
def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text)
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode('utf-8')
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == 'en':
        normalized = english.normalize(text)
    elif lang == 'ja' and disambiguation is not None:
        match = re.search(r'\((.*?)\)', disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split('/', 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = 'n'
            disambiguation = pos + '/' + re.sub(r'\s*\((.*?)\)\s*', '', rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.replace(' ', '_')
    if disambiguation:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambiguation)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))
Example #4
def load_data(messages_directory):
    for threadname in os.listdir(messages_directory):
        if threadname.startswith('.'):
            continue
        if threadname == 'stickers_used':
            continue
        # the messages are stored in the "message.json" file in the threadname directory
        # we want to load those json dictionaries, put them in the threads list, and do some data conversion
        filename = os.path.join(messages_directory, threadname, "message.json")
        threaddict = {}
        with open(filename) as f:
            threaddict = json.load(f)
        # alter every message a bit
        for message in threaddict['messages']:
            # now convert the timestamp into a real datetime object
            message['date'] = datetime.fromtimestamp(message['timestamp'])
            # for consistency, change sender_name to sender
            message['sender'] = message['sender_name']
            if 'content' not in message:
                message['content'] = ''
            message['content'] = ftfy.ftfy(message['content'])
            message['sender'] = ftfy.ftfy(message['sender'])
        # for consistency, copy participants to members
        threaddict['members'] = []
        if 'participants' in threaddict:
            for participant in threaddict['participants']:
                threaddict['members'].append(ftfy.ftfy(participant))
        threaddict['title'] = ftfy.ftfy(threaddict['title'])
        threaddict['members'].append(setup.user)
        # add index
        global threads
        threaddict['index'] = len(threads)
        threads.append(threaddict)
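A hypothetical call, assuming a Facebook export laid out as messages/<thread_name>/message.json and a module that defines the global threads list and setup.user, as the function above expects:

# Hypothetical usage; "messages" is a placeholder export directory.
threads = []
load_data("messages")
for thread in threads:
    print(thread['title'], len(thread['messages']))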
Example #5
def safe_path(origtitle):
    title = safe_path_component(ftfy(origtitle))

    if len(title) == 0:
        title = origtitle = u'_'

    if title.startswith(u'-') or title.startswith(u'.'):
        title = u'_' + title
    try:
        charname = safe_path_component(unicodedata.name(origtitle[0]))
    except ValueError:
        charname = u'UNKNOWN'
    category = charname.split('_')[0]

    # some ridiculous stuff to give every article a unique name that can be
    # stored on multiple file systems and tab-completed
    if len(origtitle) == 1:
        pieces = [u'single_character', category, charname + '.json']
    else:
        try:
            charname2 = safe_path_component(unicodedata.name(origtitle[1]))
        except ValueError:
            charname2 = u'UNKNOWN'
        text_to_encode = unicodedata.normalize("NFKD",
                                               safe_path_component(title[:64]))
        finalpart = text_to_encode.encode('punycode').rstrip('-')
        pieces = [charname, charname2, finalpart + '.json']
    path = u'/'.join(pieces)
    return path
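A self-contained sketch of the naming trick above: the Unicode name of the leading character plus a punycode-encoded title give an ASCII-safe, tab-completable path component. safe_path_component is a project helper and is omitted here:

import unicodedata

title = u'Caf\xe9 culture'
charname = unicodedata.name(title[0])   # 'LATIN CAPITAL LETTER C'
# NFKD + punycode keeps the result ASCII-only, as in safe_path above
encoded = unicodedata.normalize('NFKD', title).encode('punycode')
print(charname, encoded)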
Example #6
def leeds_corpus_frequencies(corpusfile, stemmer):
    if stemmer is None:
        stemmer = lambda x: x

    infile = codecs.open(corpusfile, encoding='utf-8')

    freqs = defaultdict(int)
    tokenfreqs = defaultdict(int)
    for line in infile:
        line = ftfy(line.strip())
        if line:
            rank = line.split(' ')[0]
            if NUMBER_RE.match(rank) and line.count(' ') == 2:
                rank, freq, token = line.split(' ')
                stemmed = stemmer(token)
                print "%s -> %s" % (token, stemmed)
                freq = float(freq)
                freq_int = int(freq*100)
                for word in stemmed.split(' '):
                    if ',' not in word:
                        freqs[word] += freq_int
                if ',' not in token:
                    tokenfreqs[token.lower()] += freq_int
    for key in tokenfreqs:
        if tokenfreqs[key] > freqs[key]:
            freqs[key] = tokenfreqs[key]
    return freqs
Example #7
def safe_path(origtitle):
    title = safe_path_component(ftfy(origtitle))
    
    if len(title) == 0:
        title = origtitle = u'_'

    if title.startswith(u'-') or title.startswith(u'.'):
        title = u'_' + title
    try:
        charname = safe_path_component(unicodedata.name(origtitle[0]))
    except ValueError:
        charname = u'UNKNOWN'
    category = charname.split('_')[0]

    # some ridiculous stuff to give every article a unique name that can be
    # stored on multiple file systems and tab-completed
    if len(origtitle) == 1:
        pieces = [u'single_character', category, charname + '.json']
    else:
        try:
            charname2 = safe_path_component(unicodedata.name(origtitle[1]))
        except ValueError:
            charname2 = u'UNKNOWN'
        text_to_encode = unicodedata.normalize("NFKD", safe_path_component(title[:64]))
        finalpart = text_to_encode.encode('punycode').rstrip('-')
        pieces = [charname, charname2, finalpart + '.json']
    path = u'/'.join(pieces)
    return path
Example #8
    def load(self, path_to_file):
        """Loads .txt file from `path_to_file`.

        Arguments:
            path_to_file (pathlib.Path):
                Path to .txt file

        Returns:
            doc (chomskIE.utils.Document)
                Document object corresponding to .txt file in `path_to_file`.
        """
        if not self._validate_data_path(path_to_file, is_directory=False):
            raise PathError(f'{path_to_file} is not a valid file path.')

        try:
            text_obj = open(path_to_file, 'r')
            text = text_obj.read()
        except UnicodeDecodeError:
            text_obj = open(path_to_file, 'rb')
            text, _ = ftfy.guess_bytes(text_obj.read())

        text = ftfy.ftfy(text)
        name = str(path_to_file).split('/')[-1]
        paragraphs = [p.strip() for p in text.splitlines() if p]

        doc = Document(name=name, text=text, paragraphs=paragraphs)
        return doc
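A minimal sketch of the decoding fallback used above: read the file as text, and fall back to ftfy.guess_bytes on the raw bytes when decoding fails ("example.txt" is a placeholder path):

import ftfy

try:
    with open("example.txt", "r") as f:
        text = f.read()
except UnicodeDecodeError:
    # guess_bytes returns the decoded text and an explanation of the encoding
    with open("example.txt", "rb") as f:
        text, _encoding = ftfy.guess_bytes(f.read())

text = ftfy.ftfy(text)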
Example #9
    def load(self, english_model, path_to_file):
        """Loads .txt file from `path_to_file`.

        Arguments:
            english_model (spacy.lang):
                Trained SpaCy language pipeline.
            path_to_file (pathlib.Path):
                Path to .txt file

        Returns:
            doc, spacy_doc (tuple)
                ``doc`` is a ``chomskIE.utils.Document`` object corresponding
                to .txt file in `path`.

                ``spacy_doc`` is a ``spacy.tokens.Doc`` object corresponding
                to .txt files in `path` processed by ``english_model``.
        """
        if not self._validate_data_path(path_to_file, is_directory=False):
            raise PathError(f'{path_to_file} is not a valid file path.')

        try:
            text_obj = open(path_to_file, 'r')
            text = text_obj.read()
        except UnicodeDecodeError:
            text_obj = open(path_to_file, 'rb')
            text, _ = ftfy.guess_bytes(text_obj.read())

        text = ftfy.ftfy(text)
        name = str(path_to_file).split('/')[-1]

        spacy_doc = english_model(text)
        doc = Document(name=name, text=None, paragraphs=None)

        return doc, spacy_doc
Example #10
def _read_csv(reader, header, encoding):
    """
    Given a constructed CSV reader object, a header row that we've read, and
    a detected encoding, yield its rows as dictionaries.
    """
    for row in reader:
        if len(row) == 0:
            continue
        row = [ftfy(cell.decode(encoding, 'replace')) for cell in row]
        row_list = zip(header, row)
        row_dict = dict(row_list)
        if len(row_dict['text']) == 0:
            continue
        row_dict['text'] = unicodedata.normalize(
            'NFKC', row_dict['text'].strip()
        )
        if row_dict.get('title') == '':
            del row_dict['title']
        if 'date' in row_dict:
            if row_dict['date'] == '':
                del row_dict['date']
            else:
                row_dict['date'] = int(row_dict['date'])
        if 'query' in row_dict or 'subset' in row_dict:
            queries = [cell[1] for cell in row_list
                       if cell[1] != '' and
                       (cell[0] == 'query' or cell[0] == 'subset')]
            if queries:
                row_dict['queries'] = queries
            if 'query' in row_dict:
                del row_dict['query']
            if 'subset' in row_dict:
                del row_dict['subset']
        yield row_dict
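A minimal sketch of the header/row pairing above, with illustrative values. Note that zip() returns an iterator in Python 3, so the pairs are materialized into a list here before being reused for both the dict and the query scan:

header = ['title', 'text', 'query', 'subset']
row = ['A title', 'Some text', 'q1', '']
row_list = list(zip(header, row))
row_dict = dict(row_list)
queries = [value for key, value in row_list
           if value != '' and key in ('query', 'subset')]
print(row_dict['text'], queries)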
Example #11
def clean_data(data):
    '''
    Augment the raw Facebook data for our graphing use cases
    '''
    # set timezone
    data['datetime'] = pd.DatetimeIndex(
        pd.to_datetime(data['timestamp_ms'],
                       unit='ms')).tz_localize('UTC').tz_convert(
                           config.TIMEZONE)

    # column for just date
    data['date'] = data["datetime"].apply(lambda d: datetime.datetime(
        year=d.year, month=d.month, day=d.day)).map(lambda x: x.date())

    # column for term of date
    data['term'] = pd.to_datetime(
        data['datetime']).apply(lambda d: "{} {}".format(
            d.strftime('%Y'), util.to_term(int(d.strftime('%m')))))

    # clean up sticker data
    data['sticker'] = data['sticker'].apply(lambda s: s['uri']
                                            if not pd.isnull(s) else None)
    duplicate_likes = [
        "messages/stickers_used/851582_369239386556143_1497813874_n_369239383222810.png",
        "messages/stickers_used/851587_369239346556147_162929011_n_369239343222814.png"
    ]
    data['sticker'] = data['sticker'].replace(
        duplicate_likes,
        "messages/stickers_used/851557_369239266556155_759568595_n_369239263222822.png"
    )

    # format text properly
    data['content'] = data['content'].apply(lambda x: ftfy.ftfy(x)
                                            if type(x) == str else x)

    # properly set message type, adding types 'Game', 'Plan Update', 'Chat Update'
    warnings.filterwarnings("ignore", 'This pattern has match groups')
    data['game'] = data['content'].str.contains(chatstats_constants.GAME_REGEX,
                                                na=False)
    data['plan_update'] = data['content'].str.contains(
        chatstats_constants.PLAN_UPDATE_REGEX, na=False)
    data['chat_update'] = data['content'].str.contains(
        chatstats_constants.CHAT_UPDATE_REGEX, na=False)
    data['call_update'] = data['content'].str.contains(
        chatstats_constants.CALL_UPDATE_REGEX, na=False)

    data['type'] = data.apply(lambda x: clean_type(x), axis=1)

    # add first name column
    data['sender_first_name'] = data['sender_name'].apply(
        lambda s: s.split()[0])

    return data
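A minimal sketch of the timestamp handling above, with a made-up value; config.TIMEZONE is assumed to be an IANA timezone name such as 'America/New_York':

import pandas as pd

df = pd.DataFrame({'timestamp_ms': [1500000000000]})
# millisecond epoch -> UTC datetimes -> local timezone
dt = pd.DatetimeIndex(pd.to_datetime(df['timestamp_ms'], unit='ms'))
dt = dt.tz_localize('UTC').tz_convert('America/New_York')
print(dt[0])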
Example #12
def open_csv_somehow(filename):
    """
    Given a filename that we're told is a CSV file, detect its encoding,
    parse its header, and return a generator yielding its rows as dictionaries.

    Use the `ftfy` module internally to fix Unicode problems at the level that
    chardet can't deal with.
    """
    encoding = detect_file_encoding(filename)
    csvfile = open(filename, 'rU')
    reader = csv.reader(csvfile, dialect='excel')
    header = reader.next()
    header = [ftfy(cell.decode(encoding).lower()) for cell in header]
    return _read_csv(reader, header, encoding)
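The docstring above mentions chardet; a rough sketch of what the detect_file_encoding helper might look like (the real helper is project-specific and not shown in these examples):

import chardet

def detect_file_encoding(filename):
    # Hypothetical implementation: let chardet guess from the raw bytes
    with open(filename, 'rb') as f:
        guess = chardet.detect(f.read())
    return guess['encoding']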
Example #13
def open_csv_somehow_py3(filename):
    encoding = detect_file_encoding(filename)
    csvfile = open(filename, 'r', encoding=encoding, newline='')
    line = csvfile.readline()
    csvfile.seek(0)

    if '\t' in line:
        # tab-separated
        reader = csv.reader(csvfile, delimiter='\t')
    else:
        reader = csv.reader(csvfile, dialect='excel')

    header = next(reader)
    header = [ftfy(cell.lower().strip()) for cell in header]
    return _read_csv(reader, header, encoding)
Example #14
def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    infile = codecs.open(filename, encoding='utf-8')

    counts = defaultdict(float)
    for line in infile:
        line = line.rstrip()
        if line:
            rank = line.split(u' ')[0]
            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                _, freq, token = line.split(u' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq

    return _scale_freqs(counts)
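The Leeds "Web as Corpus" lists are plain text with one "<rank> <frequency> <token>" entry per line; a minimal sketch of parsing a single line (values are illustrative):

line = u'42 1520.75 example'
rank, freq, token = line.split(u' ')
print(int(rank), float(freq), token)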
Example #15
def open_csv_somehow_py2(filename):
    """
    Open a CSV file using Python 2's CSV module, working around the deficiency
    where it can't handle the null bytes of UTF-16.
    """
    encoding = detect_file_encoding(filename)
    if encoding.startswith('UTF-16'):
        csvfile = transcode_to_utf8(filename, encoding)
        encoding = 'UTF-8'
    else:
        csvfile = open(filename, 'rU')
    line = csvfile.readline()
    csvfile.seek(0)

    if '\t' in line:
        # tab-separated
        reader = csv.reader(csvfile, delimiter='\t')
    else:
        reader = csv.reader(csvfile, dialect='excel')

    header = reader.next()
    header = [ftfy(cell.decode(encoding).lower().strip()) for cell in header]
    return _read_csv(reader, header, encoding)
Example #16
def fix_heading(heading):
    return ftfy(heading).strip('[]')
Example #17
def fix_heading(heading):
    return ftfy(heading).strip("[]")
Example #18
def c_cleverbot(client, message):
    yield from client.send_message(
        message.channel,
        ftfy.ftfy(sh.mention(message) + cw.say(sh.get_args(message, True))))
Example #19
def fix_heading(heading):
    return ftfy(heading).strip('[]')
Example #20
def fix(text):
    return ftfy(text).lower()
Example #21
def encodeText(text):
    return ftfy.ftfy(text)
def format(s): return ftfy.ftfy(s) if type(s) == str else s
def format_obj(obj: object):