Code example #1
def tokenize_tweets(texts, segment=True, segment_vocab=None):
    tknzr = TweetTokenizer()
    token_x = [tknzr.tokenize(t) for t in texts]
    if not segment:
        return token_x

    # segmentation is needed
    wordsegment.load()
    tokens = []
    for line in token_x:
        tokens += line
    counter = Counter(tokens)
    # identify segment-able words that are not in the known vocabulary
    segmented = {}
    for word in counter:
        if word not in (segment_vocab or ()):  # segment_vocab may be None (treat as empty)
            pieces = wordsegment.segment(word)  # avoid shadowing the `segment` flag
            if len(pieces) > 1:
                segmented[word] = pieces
    # reconstruct the list
    _token_x = []
    for line in token_x:
        _line = []
        for token in line:
            if token in segmented:
                _line += segmented[token]
            else:
                _line.append(token)
        _token_x.append(_line)
    return _token_x
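
A self-contained sketch of the segmentation step used above (the token list and vocabulary are made up for illustration; only rare tokens missing from the vocabulary are passed to wordsegment):

from collections import Counter
import wordsegment

wordsegment.load()
tokens = ["loving", "the", "#sunsetbeach", "views", "newphone"]
known_vocab = {"loving", "the", "views"}

segmented = {}
counter = Counter(tokens)
for word in counter:
    if word not in known_vocab:
        pieces = wordsegment.segment(word)  # e.g. "#sunsetbeach" -> ["sunset", "beach"]
        if len(pieces) > 1:
            segmented[word] = pieces
print(segmented)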
Code example #2
    def __init__(self):
        # always start with a fresh "index" directory
        if os.path.exists("index"):
            shutil.rmtree("index")
        os.mkdir("index")
        load()
Code example #3
def expand_dict():
    # init
    all_words = set(active.keys())
    all_words.add("leetcode")
    all_words.add("yuzhouwan")
    # split
    load()
    for word in active.keys():
        for seg in segment(word):
            all_words.add(seg)
        match = re.match(r"([a-z]+)([0-9]+)", word, re.I)
        if match:
            items = match.groups()
            for item in items:
                all_words.add(item)
    all_words = sorted(all_words)
    # build
    component = Et.Element("component")
    component.set("name", "ProjectDictionaryState")
    dictionary = Et.SubElement(component, "dictionary")
    dictionary.set("name", "yuzhouwan")
    words = Et.SubElement(dictionary, "words")
    for word in all_words:
        if len(word) < 2:
            continue
        Et.SubElement(words, "w").text = word
    data = Et.tostring(component).decode("utf-8")
    # write
    with open(".idea/dictionaries/yuzhouwan.xml", "w") as dict_xml:
        dict_xml.write(xml.dom.minidom.parseString(data).toprettyxml())
Code example #4
def build_preprocess(demojize, textify_emoji, mention_limit, punc_limit, lower_hashtag,
                     segment_hashtag, add_cap_sign):
    if textify_emoji and not demojize:
        raise Exception("textify_emoji is meaningless without demojize")

    funcs = [
        html.unescape,
        normalize_quotes,
        partial(tag, regex=URL_REGEX, tag=URL_TAG),
        partial(tag, regex=USER_REGEX, tag=USER_TAG),
        partial(tag, regex=NUMBER_REGEX, tag=NUMBER_TAG),
    ]

    if demojize:
        funcs.append(replace_emojis)
    if textify_emoji:
        funcs.append(textify_emojis)
    if mention_limit > 0:
        funcs.append(partial(limit_mentions, keep_num=mention_limit))
    if punc_limit > 0:
        funcs.append(partial(limit_punctuations, keep_num=punc_limit))
    if lower_hashtag:
        funcs.append(lower_hashtags)
    if segment_hashtag:
        load()
        funcs.append(segment_hashtags)
    if add_cap_sign:
        funcs.append(add_capital_signs)
    return compose(*funcs)
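
The `compose` helper is not shown in this snippet; below is a minimal stand-in, assuming it chains the functions left to right so that `html.unescape` runs first (everything other than the name `compose` is a placeholder):

from functools import reduce

def compose(*funcs):
    # hypothetical left-to-right composition: compose(f, g)(x) == g(f(x))
    return lambda value: reduce(lambda acc, func: func(acc), funcs, value)

# tiny usage check with stand-in steps
pipeline = compose(str.strip, str.lower)
print(pipeline("  Hello WORLD  "))  # -> "hello world"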
Code example #5
def calculate_wordsegment_accuracy(verbose=False):
    def is_not_special_character(part):
        return part not in ['.', ':', '_', '~']

    # load the wordsegment dictionaries
    load()
    df = pd.read_csv("tmp/cheat_splitting_file.csv", header=None)
    identifiers = list(itertools.chain.from_iterable(df.values[34355:, 0:1]))
    splitted_identifiers = list(
        itertools.chain.from_iterable(df.values[34355:, 1:2]))
    lendata = len(identifiers)
    count = 0
    for i in range(lendata):
        wrong_split = True
        splitted_identifier = splitted_identifiers[i].lower()
        parts = splitted_identifier.split('-')
        parts = [x for x in parts if is_not_special_character(x)]
        wordsegment_results = segment(identifiers[i])
        if len(parts) == len(wordsegment_results):
            difference = list(set(parts).difference(set(wordsegment_results)))
            if len(difference) == 0:
                count = count + 1
                wrong_split = False
        if verbose and wrong_split:
            print(parts)
            print(wordsegment_results)

    print(count / lendata)
Code example #6
def createWordEmbeddings(inFileName, num_epochs=1):
    sentences = []
    wordsegment.load()
    num_epochs = int(num_epochs)

    with open(inFileName, 'r') as csvfile:
        tweetreader = csv.reader(csvfile, delimiter='\t')
        for tweet in tweetreader:
            try:
                temp_segs = tweet[1].lower().strip().split()
                # drop URLs and mentions; filtering with a comprehension avoids
                # popping from the list while iterating over its indices
                temp_segs = [s for s in temp_segs
                             if 'http' not in s and '@' not in s]
                # optionally drop tokens unknown to the wordnet/words corpora:
                # temp_segs = [s for s in temp_segs
                #              if wordnet.synsets(s) or s in words.words()]

                temp_segs = wordsegment.segment(' '.join(temp_segs))

                sentences.append(temp_segs)
            except Exception as e:
                print(e)
                print(tweet)
                continue

    model = Word2Vec(sentences, min_count=1, iter=num_epochs)
    model.save('nonOffensiveModel-NoConstraint_' + str(num_epochs) +
               'epoch.bin')
Code example #7
File: views.py  Project: alelandt/dnse
def search_results(request):
    names = request.POST.get('search_q')
    if check_url(names):
        longitude = request.POST.get('longitude')
        latitude = request.POST.get('latitude')
        location_names = google_lookup(longitude, latitude)
        locations = list(map(strip_out, location_names))
        load()
        wlist = segment(names.split('.')[0])
        synlist = dict_lookup(wlist)
        retlist = combine_all(locations, synlist, tlds)
        templist = list(map(strip_tld, retlist))
        returnlist = []
        temp = names.split('.')[0]
        for entries in retlist:
            if SequenceMatcher(None, temp, entries).ratio() >= 0.5:
                returnlist.append(entries)
        returnlist = list(set(returnlist))
        # sort by similarity to the query term (the original constant key left the list unsorted)
        mylist = sorted(returnlist,
                        key=lambda x: SequenceMatcher(None, temp, x).ratio(),
                        reverse=True)
        mylist = list(map(strip_space, mylist))
        finalval = check_data(mylist)
        newlist = verisign_mass_lookup(finalval)
        print(newlist)
        return JsonResponse({"retlist": newlist}, safe=False)
    else:
        return JsonResponse("", safe=False)
Code example #8
def hashtags(tweets_path, out_filename):
    """
    Segment the expressions that follow hashtags.
    :param tweets_path: path to the file that contains the tweets.
    :param out_filename: path to the output file with the hashtag expressions segmented.
    :return: path to the output file with the hashtag expressions segmented.
    """
    print('\tHandling hashtags...')
    load()
    outfile = open(out_filename, "w+")
    for tweet in open(tweets_path, "r"):
        new_tweet = []
        list_of_words = tweet.split(' ')
        for i in range(len(list_of_words)):
            word = list_of_words[i]
            if word.startswith('#'):
                for w in segment(word[1:]):
                    new_tweet.append(w)
                if i == len(list_of_words) - 1:
                    new_tweet.append('\n')
            else:
                new_tweet.append(word)
        tweet_str = []
        for i in range(len(new_tweet)):
            tweet_str.append(str(new_tweet[i]))
            if i != len(new_tweet) - 1:
                tweet_str.append(' ')
        outfile.write(''.join(tweet_str))
    outfile.close()
    print('\t\tHashtags ok.')
    return out_filename
Code example #9
    def get_segmented_text_column(
        self,
        comment_text,
    ):
        wordsegment.load()

        def segment_text(
            text,
        ):
            segmented_words = [
                wordsegment.segment(
                    text=word,
                )
                for word in text.split()
            ]
            separated_words = [
                word
                for segmented_word in segmented_words
                for word in segmented_word
            ]

            segmented_text = ' '.join(separated_words)

            return segmented_text

        segmented_text_column = comment_text.apply(
            lambda x: segment_text(x)
        )

        return segmented_text_column
Code example #10
File: g_ocr.py  Project: DerickChenYR/YaleHack
def detect_text(path):
    """Detects text in the file."""

    client = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations
    print('Texts:')

    load()

    for text in texts:
        print('\n"{}"'.format(text.description))

        #vertices = (['({},{})'.format(vertex.x, vertex.y)
        #for vertex in text.bounding_poly.vertices])

        #print('bounds: {}'.format(','.join(vertices)))
    try:
        segmented_words = " ".join(segment(texts[0].description))
    except Exception:
        # no text was detected in the image
        segmented_words = ""

    return segmented_words.upper()
Code example #11
    def __init__(self,
                 train_data='1_train_CensoredRedditData_ratio_15.0.tsv',
                 trained_model='NULI.pt',
                 params_file='NULI_params.json'):
        #x_train, y_train, x_test, labelNum, testTweets, labelsAsNums, numsAsLabels, max_seq_length = load_dataset(train_data)

        wordsegment.load()

        # load in params (and close the file once it has been read)
        with open(params_file) as params_in:
            params = json.loads(params_in.readline())

        self.labelNum = params['labelNum']
        self.labelsAsNums = params['labelsAsNums']
        self.numsAsLabels = params['numsAsLabels']
        self.max_seq_length = params['max_seq_length']

        # Load pre-trained tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # load pre-trained model
        self.model = torch.load(trained_model)
Code example #12
File: search.py  Project: eode/quilt-compiler
def tokenize(string):
    """
    Split the input several times, returning intermediate results at each level:
    - delimited by underscores
    - letter/number boundaries
    - word segments
    E.g., tokenize('landuse_austin_tx_24dates') ->
        ['landuse', 'land', 'use', 'austin', 'tx', '24dates', '24', 'dates']

    (Don't need a token for the original string because to_tsvector splits on underscores.)
    """
    if not wordsegment.BIGRAMS:
        # Should only happen in dev.
        wordsegment.load()

    lvl1_parts = string.split('_')
    for lvl1 in lvl1_parts:
        lvl2_parts = ALPHA_NUM_RE.findall(lvl1)
        if len(lvl2_parts) > 1:
            yield lvl1
        for lvl2 in lvl2_parts:
            lvl3_parts = wordsegment.segment(lvl2)
            if len(lvl3_parts) > 1:
                yield lvl2
            yield from lvl3_parts
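
A rough, self-contained check of the pieces this generator combines; `ALPHA_NUM_RE` is not shown above, so the regex here is only a plausible stand-in for splitting letter/number boundaries:

import re
import wordsegment

ALPHA_NUM_RE = re.compile(r'[a-zA-Z]+|[0-9]+')  # hypothetical stand-in

wordsegment.load()
print(ALPHA_NUM_RE.findall('24dates'))   # ['24', 'dates']
print(wordsegment.segment('landuse'))    # likely ['land', 'use']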
Code example #13
File: dataUtils.py  Project: zhaoyang10/language2pose
    def tokenize(desc):
        wordsegment.load()
        desc = desc.lower()

        ## remove punctuation
        desc = nltk.tokenize.WordPunctTokenizer().tokenize(desc)
        exclude = set(string.punctuation)
        desc = [''.join([c for c in ch if c not in exclude]) for ch in desc]
        desc = [ch for ch in desc if ch]

        ## word segmentor
        desc = wordsegment.segment(' '.join(desc))

        ## remove stop words
        stopwords = set(nltk.corpus.stopwords.words('english'))
        desc = [ch for ch in desc if ch not in stopwords]

        ## remove integer values
        words = []
        for ch in desc:
            try:
                int(ch)
            except ValueError:
                words.append(ch)

        ## Lemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        words = [wordnet_lemmatizer.lemmatize(word) for word in words]

        return ' '.join(words)
Code example #14
def concat_data():
    path = os.path.dirname(os.path.abspath(__file__)) + "/data/"
    with open(path + "crawled_data.pkl", "rb") as f:
        id2entities = pickle.load(f)

    ########## Lookup Tables ##########
    labels = list(set([entity[0] for entity in id2entities.values()]))
    num_classes = len(labels)

    label_lookup = np.zeros((num_classes, num_classes), int)
    np.fill_diagonal(label_lookup, 1)
    ###################################

    text_data, context_data, label_data = [], [], []
    label_dict = {}
    for i, label in enumerate(labels):
        label_dict[label] = i

    load()
    tknzr = TweetTokenizer(reduce_len=True,
                           preserve_case=False,
                           strip_handles=False)
    print("Preprocessing tweets.....")
    for _id in id2entities:
        if id2entities[_id][0] in label_dict:
            text_data.append(text_preprocess(id2entities[_id][1], tknzr))
            context_data.append(text_preprocess(id2entities[_id][2], tknzr))

            label_data.append(label_lookup[label_dict[id2entities[_id][0]]])

    assert len(text_data) == len(context_data) == len(label_data)

    return text_data, context_data, label_data
Code example #15
    def domains_to_x_word(self, maxlen=None, n_words=50000):
        domains = self.domains

        if maxlen is None:
            maxlen = np.max([len(i) for i in domains])

        ws.load()
        for i in tqdm(range(len(domains))):
            domain_labels = domains[i].split(".")
            words = list()
            for j in range(len(domain_labels) - 1):
                segs = ws.segment(domain_labels[j])
                new_segs = list()
                for s in segs:
                    if s in ws.UNIGRAMS:
                        new_segs.append(s)
                    else:
                        new_segs += list(s)
                words += new_segs
            words.append(domain_labels[-1])
            domains[i] = words

        x = list()
        for domain in domains:
            x.append([
                text.one_hot(word, n_words, filters=" ")[0] for word in domain
            ])
        x = sequence.pad_sequences(x, padding='post', maxlen=maxlen)

        self.x = x
        self.n_words = n_words
        self.maxlen = maxlen
Code example #16
def main():
    # wordsegment load function reads and parses the unigrams and bigrams data from disk.
    # Loading the data only needs to be done once.
    load()
    app = connexion.App(__name__, specification_dir='./swagger/')
    app.app.json_encoder = encoder.JSONEncoder
    app.add_api('swagger.yaml', arguments={'title': 'Did You Mean API'})
    app.run(port=8080)
Code example #17
def segment_hashtag(text: str) -> str:
    """Remove the leading hashtag from a word and segment the remaining text."""
    text = text[1:]
    # note: loading on every call is slow; ideally call wordsegment.load() once at startup
    wordsegment.load()
    segments = wordsegment.segment(text)
    if len(segments) > 1:
        text = " ".join(segments)
    return text
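
A quick usage check of the helper above (assuming wordsegment is installed; the hashtags are made up):

print(segment_hashtag("#machinelearning"))  # likely "machine learning"
print(segment_hashtag("#cat"))              # single segment, returned as "cat"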
Code example #18
def stem_sentence(sentence):
    stemmer = nltk.stem.RSLPStemmer()
    load()
    stemmed = ""
    sentence = segment(sentence)
    for word in sentence:
        stemmed += stemmer.stem(word) + ' '
    return stemmed
Code example #19
def solve(data):
    res = []
    load()
    for dic in data:
        ans = {}
        i, s = dic['id'], dic['encryptedText']
        ans['id'] = i
        cip = CaesarCipher(s)
        ori = cip.cracked
        nn, cnt, l, ind = 0, 0, 0, 0
        for x in range(len(s) - 1):
            for y in range(x + 1, len(s)):
                tmp = ori[x:y + 1]
                cur = y - x + 1
                if tmp == tmp[::-1]:
                    if cur > l:
                        ind, l = x, cur
                    nn += 1
        has = []
        for c in ori[ind:ind + l]:
            has.append(ord(c))
        ans['encryptionCount'] = 0
        tar = ord(s[0])
        cnt = ord(ori[0])
        if l == 0:
            for t in range(100):
                if cnt == tar:
                    ans['encryptionCount'] = t
                    break
                cnt += cnt
                if cnt > 122:
                    cnt = (cnt - 123) % 26 + 97
        else:
            for t in range(100):
                if cnt == tar:
                    ans['encryptionCount'] = t
                    break
                tmp = sum(has) + nn
                for i in range(len(has)):
                    has[i] += tmp
                    if has[i] > 122:
                        has[i] = (has[i] - 123) % 26 + 97
                cnt += tmp
                if cnt > 122:
                    cnt = (cnt - 123) % 26 + 97
        tmp = wordninja.split(ori)
        s = tmp[0]
        for i in range(1, len(tmp)):
            prev, cur = tmp[i - 1], tmp[i]
            # glue common suffix fragments back onto the previous word
            if (cur == 'al'
                    or (cur == 'in' and prev == 'n')
                    or (cur == 'ty' and prev == 'in')
                    or cur == 's'
                    or (cur == 'i' and prev == 'a')):
                s += cur
            else:
                s += ' ' + cur
        ans['originalText'] = s
        res.append(ans)
    return res
Code example #20
    def segment_words(self, df):
        load()

        # segment combined words, such in the case of hashtags
        # e.g. #word1word2 to #word1 word2
        df['text_preprocessed'] = df['text_preprocessed'].apply(
            lambda x: ' '.join(segment(x)))

        return df
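
A small end-to-end illustration of the same `apply` pattern on a throwaway DataFrame (the column values are made up):

import pandas as pd
from wordsegment import load, segment

load()
df = pd.DataFrame({'text_preprocessed': ['thisisatest', 'segmenthashtags']})
df['text_preprocessed'] = df['text_preprocessed'].apply(lambda x: ' '.join(segment(x)))
print(df['text_preprocessed'].tolist())  # e.g. ['this is a test', 'segment hashtags']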
Code example #21
def spell_correct(word):
    spell = SpellChecker()
    load()
    correct_spelling = ''
    words = word.split(' ')
    for wd in words:
        correct_spelling = correct_spelling + spell.correction(wd) + ' '
    correct_spelling = ' '.join(segment(correct_spelling))

    return correct_spelling
Code example #22
def fc2(inputHashTags):
    # check the number of word segments in the given input hashtag
    if (len(inputHashTags) == 0):
        return constant.CONSTANT_INPUT_VALIDATION_ERROR
    elif (inputHashTags[0] == constant.CONSTANT_KEYWORD_HASHTAG
          and len(inputHashTags) == 1):
        return constant.CONSTANT_INPUT_VALIDATION_ERROR
    else:
        load()
        x = segment(inputHashTags[1:])
        return len(x)
Code example #23
File: segmenter.py  Project: afmueller/smart-rename
    def __init__(self, ngrams=None):
        ws.load()

        # add unigrams to wordsegment defaults
        if ngrams and 'unigrams' in ngrams:
            for ngram, count in ngrams['unigrams'].items():
                ws.UNIGRAMS[ngram] = count

        # add bigrams to wordsegment defaults
        if ngrams and 'bigrams' in ngrams:
            for ngram, count in ngrams['bigrams'].items():
                ws.BIGRAMS[ngram] = count
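
The effect of injecting custom counts can be sanity-checked against the module-level tables; the term and count below are arbitrary:

import wordsegment as ws

ws.load()
print(ws.segment('smartrename'))       # without custom counts, likely ['smart', 'rename']

# give a made-up project term an artificially high unigram count
ws.UNIGRAMS['smartrename'] = 1e9
print(ws.segment('smartrename'))       # now likely kept whole: ['smartrename']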
Code example #24
    def __init__(self, include_tld=True, option=DomainMatchingOption.ORDER_MATCH):
        '''
        Load the wordsegment data; this only needs to be done once.
        '''
        wordsegment.load()

        # Save the TLD flag and matching option so we can refer to them later
        self.include_tld = include_tld

        self.option = {
            DomainMatchingOption.SUBSET_MATCH: set,
            DomainMatchingOption.ORDER_MATCH: list,
        }[option]
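
The set/list mapping above presumably controls how segmented domain tokens are compared later: a set ignores order and duplicates, a list preserves them. A tiny illustration with hard-coded tokens:

a = ['pay', 'pal', 'login']
b = ['login', 'pay', 'pal']
print(set(a) == set(b))    # True  -> SUBSET_MATCH ignores token order
print(list(a) == list(b))  # False -> ORDER_MATCH requires the same order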
Code example #25
def collect_tags_and_decomposition(path):
    tags = pickle.load(open(path, "rb"))
    load()
    for key, value in tags.items():
        cur_list = []
        for v in value[0]:
            cur = v.split('#')[1].lower()
            cur = segment(cur)
            cur = ' '.join(cur)
            if cur:
                cur_list.append(cur)
        tags[key] = cur_list
    return tags
Code example #26
def bigram(words):

    ws.load()

    values = [1.]

    for word1, word2 in zip(words[:-1], words[1:]):
        try:
            values += [np.log10(ws.BIGRAMS[' '.join([word1, word2])])]
        except KeyError:
            values += [1.]

    return values
Code example #27
def unigram(words):

    ws.load()

    values = []

    for word in words:
        try:
            values += [np.log10(ws.UNIGRAMS[word])]
        except KeyError:
            values += [1.]

    return values
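
Both helpers above just look up raw corpus counts; the underlying tables can be inspected directly (assuming `ws` is the `wordsegment` module and `np` is numpy):

import numpy as np
import wordsegment as ws

ws.load()
print(np.log10(ws.UNIGRAMS['the']))    # log10 of the unigram count for "the"
print(np.log10(ws.BIGRAMS['in the']))  # log10 of the bigram count for "in the"
print('zzzxqj' in ws.UNIGRAMS)         # False: unknown tokens fall back to 1. above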
Code example #28
File: word_completer.py  Project: akimous/akimous
def _initialize():
    with Timer('initializing dictionary'):
        global initialized
        # takes 500ms, 100M memory
        wordsegment.load()
        # takes 900ms, 15M memory
        c.executemany('INSERT INTO d VALUES (?,?,?)',
                      ((k[:3], k[3:], int(v))
                       for k, v in wordsegment.UNIGRAMS.items() if len(k) > 3))
        # takes 200ms, 10M memory
        c.execute('CREATE INDEX idx on d(p, f)')
        conn.commit()
        initialized = True
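
A self-contained sketch of the same idea on an in-memory database: index wordsegment's unigrams by their first three letters so prefix completions can be queried quickly. The CREATE TABLE schema below is a guess consistent with the insert and index statements above:

import sqlite3
import wordsegment

wordsegment.load()
conn = sqlite3.connect(':memory:')
c = conn.cursor()
c.execute('CREATE TABLE d (p TEXT, s TEXT, f INTEGER)')  # prefix, suffix, frequency
c.executemany('INSERT INTO d VALUES (?,?,?)',
              ((k[:3], k[3:], int(v))
               for k, v in wordsegment.UNIGRAMS.items() if len(k) > 3))
c.execute('CREATE INDEX idx ON d(p, f)')
conn.commit()
# most frequent completions for words starting with "seg"
c.execute('SELECT p || s FROM d WHERE p = ? ORDER BY f DESC LIMIT 5', ('seg',))
print([row[0] for row in c.fetchall()])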
Code example #29
def get_default_graph():
    """
    Convenience function to get default graph
    """
    with open('./config.json') as config_file:
        cfg = json.load(config_file)
    ws.load()
    nlp = spacy.load('en_core_web_md')
    sr_df = pd.read_csv('./low_filtered_strict.csv')
    grapher = Grapher()
    grapher.create_graph_nodes_from_ilocs(sr_df, nlp)
    grapher.create_id_mapping()
    grapher.create_fc_graph(cfg)
    return grapher
Code example #30
File: confirm_symbol.py  Project: ZhouyangJia/DepOwl
def check_source_package(symbol_problem_dict):
    global last_source
    global count

    last_source = ''
    confirmed_row = []

    # read package-source mapping
    pkg_src_dict = {}
    with open('pkg_src_map.txt', 'r') as mapping_file:
        mapping_lines = mapping_file.readlines()
    for mapping_line in mapping_lines:
        mapping_line = mapping_line.strip()
        [package, source] = mapping_line.split(' ')
        pkg_src_dict[package] = source

    # check instances for the current source package
    conn = sqlite3.connect('depbug.db')
    cur = conn.cursor()
    cur.execute("select * from potential_depbug order by PkgName")
    rows = cur.fetchall()
    conn.close()
    load()
    tmp = 0
    for row in rows:
        count += 1
        row_data = [x.encode('ascii') for x in row[1:]]
        [
            PkgName, PkgVer, DepName, DepVer, LibName, LibObject, PreVer,
            PostVer, Direction, Severity, Symver
        ] = row_data

        if Severity == 'SymRmv':
            insert_depbug('depbug_confirm', row_data)
            insert_depbug('depbug_detect', row_data)
            continue

        # dict.has_key() only exists in Python 2; use the `in` operator instead
        if PkgName not in pkg_src_dict:
            continue
        source = pkg_src_dict[PkgName]
        my_key = [LibName, LibObject, PreVer, PostVer, Direction, Symver]
        if str(my_key) not in symbol_problem_dict:
            continue
        symbol = Symver.split('@')[0]
        # skip private data type
        if symbol in private_type:
            continue
        # check each candidate problem for this row
        for problem in symbol_problem_dict[str(my_key)]:
            confirmed(row_data, source, symbol, problem)
Code example #31
File: train.py  Project: vialab/semantic-guesser
from learning.pos import BackoffTagger, SpacyTagger, COCATagger
from learning.tagset_conversion import TagsetConverter
from learning.tree.wordnet import IndexedWordNetTree
from learning.model import TreeCutModel, Grammar, GrammarTagger

from pattern.en import pluralize, lexeme

from misc.util import Timer

# load global resources

log = logging.getLogger(__name__)
tag_converter = TagsetConverter()
proper_noun_tags = set(BackoffTagger.proper_noun_tags())
ws.load()


def new_wordnet_instance():
    """
    Create a new wordnet instance. This is usefult for parallel workflows.
    Multiple processes cannot access the same wordnet instance (as when imported
    globally with `from wordnet.corpus import wordnet`). This is due nltk not
    being thread-safe.
    """
    return LazyCorpusLoader(
        'wordnet', WordNetCorpusReader,
        LazyCorpusLoader('omw', CorpusReader,
                         r'.*/wn-data-.*\.tab', encoding='utf8')
    )
Code example #32
import os
import sys
from .context import wordsegment
from wordsegment import (
    clean, load, main, isegment, segment, UNIGRAMS, BIGRAMS, WORDS,
)

load()

def test_unigrams():
    assert 'test' in UNIGRAMS

def test_bigrams():
    assert 'in the' in BIGRAMS

def test_clean():
    assert clean("Can't buy me love!") == 'cantbuymelove'

def test_segment_0():
    result = ['choose', 'spain']
    assert segment(''.join(result)) == result

def test_segment_1():
    result = ['this', 'is', 'a', 'test']
    assert segment(''.join(result)) == result

def test_segment_2():
    result = [
        'when', 'in', 'the', 'course', 'of', 'human', 'events', 'it',
        'becomes', 'necessary'
    ]