Code Example #1
File: backuprestore.py Project: camster1/RTOTV
 def backup_skinshortcuts_properties(propertiesfile, dest_path):
     '''parse skinshortcuts properties file and translate images'''
     # look for any backgrounds and translate them
     propfile = xbmcvfs.File(propertiesfile)
     data = propfile.read()
     propfile.close()
     allprops = eval(data) if data else []
     for count, prop in enumerate(allprops):
         if prop[2] == "background":
             background = prop[3] if prop[3] else ""
             defaultid = prop[1]
             if background.endswith(".jpg") or background.endswith(
                     ".png") or background.endswith(".gif"):
                 background = get_clean_image(background)
                 extension = background.split(".")[-1]
                 newthumb = os.path.join(
                     dest_path, "%s-background-%s.%s" %
                     (xbmc.getSkinDir(), normalize_string(defaultid),
                      extension))
                 newthumb_vfs = "special://profile/addon_data/script.skinshortcuts/%s-background-%s.%s" % (
                     xbmc.getSkinDir(), normalize_string(defaultid),
                     extension)
                 if xbmcvfs.exists(background):
                     xbmcvfs.copy(background, newthumb)
                     allprops[count] = [
                         prop[0], prop[1], prop[2], newthumb_vfs
                     ]
     # write updated properties file
     propfile = xbmcvfs.File(propertiesfile, "w")
     propfile.write(repr(allprops))
     propfile.close()
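
The Kodi skin-helper examples in this listing never show normalize_string itself; here it only has to turn a shortcut label into a filesystem-safe name. A minimal sketch of such a helper, assuming accent normalization and removal of characters that are illegal in file names (the add-on's real implementation may differ):

import unicodedata

def normalize_string(text):
    # Sketch only: drop or replace characters that are unsafe in file names,
    # then normalize the unicode form.
    text = text.replace(":", "").replace("/", "-").replace("\\", "-")
    text = unicodedata.normalize("NFKD", text)
    return text.strip()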
Code Example #2
    def __init__(self):
        self.data = []
        self.dictionary = Dictionary()
        self.max_sent_len = 0

        # Read the positive reviews
        with open(POSITIVE_REVIEWS_FILE, encoding='utf-8') as f:
            positive_reviews = f.readlines()
        for review in positive_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 1))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Read the negative reviews
        with open(NEGATIVE_REVIEWS_FILE, encoding='utf-8') as f:
            negative_reviews = f.readlines()
        for review in negative_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 0))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Split the original dataset into train/test
        random.shuffle(self.data)
        split_index = int(0.9 * len(self.data))
        self.train = AugmentedList(self.data[:split_index])
        self.test = AugmentedList(self.data[split_index:])
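
Example #2 above, and the chatbot/translation snippets later in this listing, use the one-argument NLP flavor of normalize_string. A plausible sketch, assuming the usual lowercase / strip-accents / space-out-punctuation behavior of seq2seq preprocessing code (individual projects may differ):

import re
import unicodedata

def normalize_string(s):
    # Sketch only: lowercase, strip accents, isolate punctuation, drop other symbols.
    s = ''.join(c for c in unicodedata.normalize('NFD', s.lower().strip())
                if unicodedata.category(c) != 'Mn')
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()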
Code Example #3
 def is_another_service(position):
     normalized = normalize_string(position)
     return any(service in normalized
                for service in ('uber air', 'uberair', 'freight', 'elevate'))
Code Example #4
File: import_rover.py Project: pervrosen/FineMerge
def main():

    args = parse_args()

    global labels
    with open(args.labels) as label_file:
        labels = json.load(label_file)

    df_service = pd.read_csv(args.service_output,
                             delimiter='\t').set_index('file_name')

    df_utt = pd.read_csv(args.utterances, delimiter='\t')
    final_utterances = df_utt['file_name'].to_list()
    df_utt.set_index('file_name', inplace=True)

    smoothen_val = 1e-20
    ds2_logits = args.ds2_probs
    data = np.load(ds2_logits, allow_pickle=True)

    print("Getting ds2 transcripts")
    utterances, probs_list = zip(*data)
    ds2_transcripts = ctc_beam_decode(probs_list, labels, args.lm_path,
                                      labels.index('_'), args.lm_alpha,
                                      args.lm_beta, args.beam_size)

    ds2_data = list(zip(utterances, probs_list, ds2_transcripts))

    print("Getting ds2 word level confidence values")
    with Pool(multiprocessing.cpu_count()) as pool:
        ds2_word_confs = list(
            tqdm(pool.imap(get_word_confidence, ds2_data),
                 total=len(ds2_data)))

    with open(args.output_path, 'w') as fd:
        fd.write(
            'fname\treference\tservice_transcript\tservice_confs\tds2_transcript\tds2_confs\n'
        )
        for fname in final_utterances:

            ref = normalize_string(df_utt.loc[fname]['transcript'], labels[1:])
            service_transcript, service_conf = df_service.loc[fname][[
                'transcript', 'word_confs'
            ]]
            norm_service_transcript = normalize_string(service_transcript,
                                                       labels[1:])
            aligned_service_conf = align_word_confs(service_transcript,
                                                    norm_service_transcript,
                                                    service_conf)
            aligned_service_conf = ' '.join(aligned_service_conf)
            # norm_service_transcript, aligned_service_conf = parse_text(service_transcript, service_conf)
            ds2_conf = ds2_word_confs[utterances.index(fname)]
            ds2_conf = ' '.join([str(x) for x in ds2_conf])
            ds2_transcript = ds2_transcripts[utterances.index(fname)]
            fd.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(fname, ref, norm_service_transcript, \
                aligned_service_conf, ds2_transcript, ds2_conf))
Code Example #5
def main():

    parser = argparse.ArgumentParser(
        description="Generate dataset for FineMerge")
    parser.add_argument(
        "--ds2_probs",
        help=
        "Path to frame-level token probs pkl file obtained from final layer of ds2, "
        "should be a list of tuples containing (file_names, probs)",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--service_output",
        help="Path to ASR service output",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--labels",
        help=
        "Path to labels json file containing ordered (w.r.t. ds2 output) list of output labels",
        type=str,
        required=False,
        default='../labels_char.json')
    parser.add_argument(
        "--output_path",
        help="Path to save the output pickle file",
        type=str,
        required=True,
    )

    args = parser.parse_args()

    with open(args.labels) as label_file:
        labels = json.load(label_file)

    service_map = {}
    with open(args.service_output) as fd:
        lines = fd.read().splitlines()

    for _, line in tqdm(enumerate(lines[1:]), total=len(lines[1:])):
        file_name, transcript, word_confs = line.split('\t')
        norm_transcript = normalize_string(transcript, labels[1:])
        aligned_word_confs = align_word_confs(transcript, norm_transcript,
                                              word_confs)
        # norm_transcript, aligned_word_confs = parse_text(transcript, word_confs)
        aligned_word_confs = [float(conf) for conf in aligned_word_confs]
        service_map[file_name] = {'service_transcript' : norm_transcript, \
            'word_confs' : aligned_word_confs}

    dataset = {}
    data = np.load(args.ds2_probs, allow_pickle=True)
    for file_name, probs in data:
        if file_name in service_map.keys():
            dataset[file_name] = service_map[file_name]
            dataset[file_name]['ds2_probs'] = probs

    with open(args.output_path, 'wb') as fd:
        pickle.dump(dataset, fd)
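
The FineMerge examples call a two-argument normalize_string(text, labels[1:]), which evidently restricts the text to the model's output alphabet. A hedged sketch of what that contract implies (the project's actual implementation is not shown in this listing):

def normalize_string(text, labels):
    # Sketch only: lowercase the text, keep just the characters the acoustic
    # model can emit (the label set), and collapse leftover whitespace.
    allowed = set(labels)
    kept = ''.join(ch for ch in text.lower() if ch in allowed)
    return ' '.join(kept.split())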
Code Example #6
 def get_custom_path(self, searchtitle, title):
     """locate custom folder on disk as pvrart location"""
     title_path = ""
     custom_path = self._mutils.addon.getSetting("pvr_art_custom_path")
     if custom_path and self._mutils.addon.getSetting(
             "pvr_art_custom") == "true":
         delim = "\\" if "\\" in custom_path else "/"
         dirs = xbmcvfs.listdir(custom_path)[0]
         for strictness in [1, 0.95, 0.9, 0.8]:
             if title_path:
                 break
             for directory in dirs:
                 if title_path:
                     break
                 if sys.version_info.major < 3:
                     directory = directory.decode("utf-8")
                 curpath = os.path.join(custom_path, directory) + delim
                 for item in [title, searchtitle]:
                     match = SM(None, item, directory).ratio()
                     if match >= strictness:
                         title_path = curpath
                         break
         if not title_path and self._mutils.addon.getSetting(
                 "pvr_art_download") == "true":
             title_path = os.path.join(custom_path,
                                       normalize_string(title)) + delim
     return title_path
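
SM in this example is presumably difflib.SequenceMatcher (a common alias in Kodi add-ons); a quick illustration of the fuzzy ratio the strictness loop relies on:

from difflib import SequenceMatcher as SM

# ratio() returns a similarity in [0, 1]; the loop above accepts a directory
# once the ratio clears the current strictness threshold.
print(SM(None, "The Office", "The Office (US)").ratio())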
Code Example #7
File: preprocess.py Project: zbxzc35/medical_caption
def stat_lang(findings_file):
    with open(findings_file, 'r') as f:
        findings = json.load(f)

    sent_max_num = 0
    word_max_num = 0
    sent_max_report = None
    word_max_report = None

    for item in findings:
        report = item['report']
        caption = normalize_string(report)
        caption = [
            sent.strip() for sent in caption.split(' .')
            if len(sent.strip()) > 0
        ]
        if sent_max_num < len(caption):
            sent_max_num = len(caption)
            sent_max_report = caption

        for sent in caption:
            words = sent.split()
            if word_max_num < len(words):
                word_max_num = len(words)
                word_max_report = sent
    print('max sentence number is {} and max words num is {}'.format(
        sent_max_num, word_max_num))
    print(sent_max_report)
    print(word_max_report)
Code Example #8
def main():

    args = parse_args()
    global saved_output, references, labels

    with open(args.labels) as label_file:
        labels = json.load(label_file)

    saved_output = dict(np.load(args.saved_output, allow_pickle=True))
    df_utt = pd.read_csv(args.utterances, delimiter='\t')
    utterances = df_utt['file_name'].to_list()
    references = df_utt['transcript'].to_list()
    references = [normalize_string(text, labels[1:]) for text in references]
    saved_output = [saved_output[utt] for utt in utterances]

    p = Pool(multiprocessing.cpu_count(), init,
             [labels, args.lm_path, args.beam_width])
    cand_alphas = np.linspace(args.lm_alpha_from, args.lm_alpha_to,
                              args.lm_num_alphas)
    cand_betas = np.linspace(args.lm_beta_from, args.lm_beta_to,
                             args.lm_num_betas)
    params_grid = [(float(alpha), float(beta)) for alpha in cand_alphas
                   for beta in cand_betas]
    scores = []
    for params in tqdm(p.imap(decode_dataset, params_grid),
                       total=len(params_grid)):
        scores.append(list(params))
    print("Saving tuning results to finetune.".format(args.output_path))
    with open(args.output_path, "w") as fh:
        json.dump(scores, fh)

    min_results = min(scores, key=lambda x: x[2])
    print("Best Params:\nAlpha: %f \nBeta: %f \nWER: %f" % tuple(min_results))
Code Example #9
 def parse_row(self, row):
     text = normalize_string(row.text)
     sibling = row.getnext()
     if 'name' in text:
         self.name = normalize_string(sibling.text)
     elif 'age' in text and len(text) <= 4:
         self.age = sibling.text
     elif 'nationality' in text:
         nationality = sibling.find('img').get('alt')
         self.nationality = normalize_string(nationality)
     elif 'formation' in text:
         self.preferred_formation = sibling.text
     elif 'success rate' in text:
         self.win_percentage = sibling.\
                 xpath(queries["win_percentage"])[0]\
                 .text\
                 .replace(',', '.')
Code Example #10
 def read_words(filename: str, max_length: int) -> List[Word]:
     words: List[Word] = list()
     with open(filename, mode='rt', encoding='utf8') as f:
         for line in f:  # type: str
             for word in normalize_string(
                     line).strip().split():  # type: str
                 if len(word) <= max_length:
                     words.append(Word(list(word)))
     return words
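
read_words assumes a Word type that wraps a character list. A minimal stand-in definition so the snippet type-checks on its own (the real project's class is likely richer):

from typing import List

class Word:
    # Hypothetical minimal Word: just holds the character list it was built from.
    def __init__(self, chars: List[str]) -> None:
        self.chars = chars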
Code Example #11
def main():

    parser = argparse.ArgumentParser(
        description=
        "create pickle file containing the topN transcipts for each utterance from google API"
    )
    parser.add_argument(
        "--json_dir",
        help="Path to dir containing output json from Google API",
        type=str,
        required=True,
    )
    parser.add_argument("--labels",
                        help="Path to ds2 output labels",
                        type=str,
                        required=False,
                        default='../labels_char.json')
    parser.add_argument(
        "--output_path",
        help="Path to save output pickle file",
        type=str,
        required=True,
    )
    args = parser.parse_args()

    dataset = {}
    json_files = glob.glob(join(args.json_dir, '*.json'))
    with open(args.labels) as label_file:
        labels = json.load(label_file)

    for json_pth in json_files:

        file_name = basename(json_pth).replace('.json', '.wav')
        try:
            with open(json_pth) as fd:
                data = json.load(fd)
        except (OSError, ValueError):  # unreadable or malformed json
            print(json_pth)
            continue

        for rank, alt in enumerate(data['results'][0]['alternatives']):

            transcript = normalize_string(alt['transcript'], labels[1:])
            # transcript = parse_text(alt['transcript'])
            confidence = alt['confidence']
            if rank == 0:
                dataset[file_name] = {
                    'transcripts': [transcript],
                    'confidences': [confidence]
                }
            else:
                dataset[file_name]['transcripts'].append(transcript)
                dataset[file_name]['confidences'].append(confidence)

    with open(args.output_path, 'wb') as fd:
        pickle.dump(dataset, fd)
Code Example #12
 def parse_info(self):
     rows = self.get_info_table().findall('tr')
     rows = map(lambda row: row.find('th'), rows)
     for row in rows:
         self.parse_row(row)
     if not self.name:
         name_attempt = self.tree.xpath("//div[@itemprop='name']/h1")
         if not name_attempt:
             raise Exception("manager parsing error")
         self.name = normalize_string(name_attempt[0].text)
Code Example #13
File: backuprestore.py Project: camster1/RTOTV
 def backup_skinshortcuts_images(shortcutfile, dest_path):
     '''parse skinshortcuts file and copy images to backup location'''
     shortcutfile = xbmc.translatePath(shortcutfile).decode("utf-8")
     doc = parse(shortcutfile)
     listing = doc.documentElement.getElementsByTagName('shortcut')
     for shortcut in listing:
         defaultid = shortcut.getElementsByTagName('defaultID')
         if defaultid:
             defaultid = defaultid[0].firstChild
             if defaultid:
                 defaultid = defaultid.data
             if not defaultid:
                 defaultid = shortcut.getElementsByTagName(
                     'label')[0].firstChild.data
             thumb = shortcut.getElementsByTagName('thumb')
             if thumb:
                 thumb = thumb[0].firstChild
                 if thumb:
                     thumb = thumb.data
                     if thumb and (thumb.endswith(".jpg")
                                   or thumb.endswith(".png")
                                   or thumb.endswith(".gif")):
                         thumb = get_clean_image(thumb)
                         extension = thumb.split(".")[-1]
                         newthumb = os.path.join(
                             dest_path, "%s-thumb-%s.%s" %
                             (xbmc.getSkinDir(),
                              normalize_string(defaultid), extension))
                         newthumb_vfs = "special://profile/addon_data/script.skinshortcuts/%s-thumb-%s.%s" % (
                             xbmc.getSkinDir(), normalize_string(defaultid),
                             extension)
                         if xbmcvfs.exists(thumb):
                             xbmcvfs.copy(thumb, newthumb)
                             shortcut.getElementsByTagName(
                                 'thumb')[0].firstChild.data = newthumb_vfs
     # write changes to skinshortcuts file
     shortcuts_file = xbmcvfs.File(shortcutfile, "w")
     shortcuts_file.write(doc.toxml(encoding='utf-8'))
     shortcuts_file.close()
Code Example #14
File: app.py Project: RedisAI/ChatBotDemo
def chat():
    req_data = request.get_json()
    message = utils.normalize_string(req_data['message'])
    try:
        indices = utils.get_batched_indices(message)
    except KeyError:
        reply = "I did not understand your language!!, check the spelling perhaps"
    else:
        numpy_array = utils.list2numpy(indices)
        reply = redis_db.process(numpy_array)

    resp = jsonify(reply=reply)
    return resp
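
chat() uses Flask's request and jsonify, so it is presumably a Flask view. A hedged sketch of how it might be wired up (the route name and app object are assumptions; the repository's actual wiring is not shown here):

from flask import Flask

app = Flask(__name__)

# Hypothetical registration; RedisAI/ChatBotDemo may use a different route.
app.add_url_rule('/chat', view_func=chat, methods=['POST'])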
Code Example #15
 def next_batch(self, batch_size, mode=TRAIN_MODE):
     review_lengths, reviews, targets = [], [], []
     data = self.train if mode == TRAIN_MODE else self.test
     batch = data.next_items(batch_size)
     for (review, target) in batch:
         review_length = len(word_tokenize(normalize_string(review)))
         review = indexes_from_sentence(review, self.dictionary,
                                        self.max_sent_len)
         target = one_hot_encoding(2, target)
         reviews.append(review)
         targets.append(target)
         review_lengths.append(review_length)
     return review_lengths, reviews, targets
Code Example #16
def clean_data(mylist):
    """
    Removes non-relevant entries
    :param mylist: list of dict
    :return: a list of dict
    """
    new_list = []
    for employee in mylist:
        name = employee['name']
        position = employee['position']
        if normalize_string(name) == 'linkedin member':
            continue

        if Conditions.meet_conditions(position):
            new_list.append(employee)

    return new_list
Code Example #17
def main():

    global data, utterances, references, labels, args
    args = parse_args()

    with open(args.labels) as label_file:
        labels = json.load(label_file)

    data = np.load(args.data, allow_pickle=True)
    df_utt = pd.read_csv(args.utterances, delimiter='\t')
    df_utt = df_utt[df_utt['file_name'].isin(data.keys())]  #TODO
    utterances = df_utt['file_name'].to_list()
    references = df_utt['transcript'].to_list()
    references = [normalize_string(text, labels[1:]) for text in references]

    print("Computing ctc alignments service transcripts...")
    with Pool(multiprocessing.cpu_count()) as pool:
        alignments = list(
            tqdm(pool.imap(get_alignments, utterances), total=len(utterances)))

    for utt, alignment in zip(utterances, alignments):
        data[utt]['alignment'] = alignment

    cand_ths = np.linspace(args.th_from, args.th_to, args.num_th)
    cand_service_weights = np.linspace(args.service_weight_from, \
        args.service_weight_to, args.num_service_weights)
    cand_blank_confs = np.linspace(args.blank_conf_from, \
        args.blank_conf_to, args.num_blank_confs)
    params_grid = [(th, service_weight, blank_conf) for th in cand_ths
                   for service_weight in cand_service_weights
                   for blank_conf in cand_blank_confs]

    scores = []
    with Pool(multiprocessing.cpu_count()) as pool:
        for params in tqdm(pool.imap(merge_transcripts, params_grid),
                           total=len(params_grid)):
            scores.append(list(params))

    print("Saving tuning results to finetune.".format(args.output_path))
    with open(args.output_path, "w") as fh:
        json.dump(scores, fh)

    min_results = min(scores, key=lambda x: x[-1])
    print("Best Params:\nThreshold: %.12f \nService Weight: %.2f"\
        " \nBlank Conf %.2f \nWER: %.6f" % tuple(min_results))
Code Example #18
    def getHeader(self, authenticate=True):

        clientInfo = self.clientInfo

        deviceName = clientInfo.getDeviceName()
        deviceName = utils.normalize_string(deviceName.encode('utf-8'))
        deviceId = clientInfo.getDeviceId()
        version = clientInfo.getVersion()

        if not authenticate:
            # If user is not authenticated
            auth = (
                'MediaBrowser Client="Kodi", Device="%s", DeviceId="%s", Version="%s"'
                % (deviceName, deviceId, version))
            header = {
                'Content-type': 'application/json',
                'Accept-encoding': 'gzip',
                'Accept-Charset': 'UTF-8,*',
                'Authorization': auth
            }
            self.logMsg("Header: %s" % header, 2)

        else:
            userId = self.userId
            token = self.token
            # Attached to the requests session
            auth = (
                'MediaBrowser UserId="%s", Client="Kodi", Device="%s", DeviceId="%s", Version="%s"'
                % (userId, deviceName, deviceId, version))
            header = {
                'Content-type': 'application/json',
                'Accept-encoding': 'gzip',
                'Accept-Charset': 'UTF-8,*',
                'Authorization': auth,
                'X-MediaBrowser-Token': token
            }
            self.logMsg("Header: %s" % header, 2)

        return header
Code Example #19
File: preprocess.py Project: zbxzc35/medical_caption
def word_frequency(findings_file):
    with open(findings_file, 'r') as f:
        findings = json.load(f)

    word2count = {}

    for item in findings:
        caption = item['report'].strip().lower()
        caption = normalize_string(caption)
        caption = [
            sent.strip() for sent in caption.split(' .')
            if len(sent.strip()) > 0
        ]
        for sent in caption:
            for word in sent.split():

                if not re.match(r'^[a-zA-Z]+$', word):
                    continue

                word2count[word] = word2count.get(word, 0) + 1

    total = sum(word2count.values())

    word_frequency = [{
        'word': word,
        'count': count,
        'frequency': count * 1.0 / total
    } for word, count in word2count.items()]
    word_frequency.sort(key=lambda x: x['count'], reverse=True)

    frequency_sum = 0.0
    for idx, item in enumerate(word_frequency):
        frequency_sum = frequency_sum + item['frequency']
        if frequency_sum > 0.99:
            print('top {} words covers 99% occurrences'.format(idx + 1))
            break

    word_frequency = pd.DataFrame(word_frequency)
    word_frequency.to_csv('../output/preprocess/IU_Chest_XRay/words.csv',
                          index=False)
Code Example #20
    def caption_preprocess(self, report):
        caption = normalize_string(report)
        caption = [
            sent.strip() for sent in caption.split(' .')
            if len(sent.strip()) > 0
        ]

        sent_embds = []

        sent_length = []
        sent_num = min(len(caption), MAX_SENT)

        # Truncate overly long captions and overly long sentences
        for i_sent in range(MAX_SENT):

            if i_sent >= len(caption):
                sent_embds.append([self.word2idx['EOS']] * (MAX_WORDS + 1))
                sent_length.append(0)
                continue
            sent = caption[i_sent]
            words = sent.split()

            length = min(len(words), MAX_WORDS)
            sent_length.append(length)

            sent_embd = []

            for i_word in range(MAX_WORDS + 1):
                if i_word >= len(words):
                    sent_embd.append(self.word2idx['EOS'])
                else:
                    word = words[i_word]
                    sent_embd.append(
                        self.word2idx.get(word, self.word2idx['UNK']))

            sent_embds.append(sent_embd)

        return sent_embds, sent_num, sent_length
Code Example #21
 def get_custom_path(self, searchtitle, title):
     '''locate custom folder on disk as pvrart location'''
     title_path = ""
     custom_path = self.metadatautils.addon.getSetting("pvr_art_custom_path")
     if custom_path and self.metadatautils.addon.getSetting("pvr_art_custom") == "true":
         delim = "\\" if "\\" in custom_path else "/"
         dirs = xbmcvfs.listdir(custom_path)[0]
         for strictness in [1, 0.95, 0.9, 0.8]:
             if title_path:
                 break
             for directory in dirs:
                 if title_path:
                     break
                 directory = directory.decode("utf-8")
                 curpath = os.path.join(custom_path, directory) + delim
                 for item in [title, searchtitle]:
                     match = SM(None, item, directory).ratio()
                     if match >= strictness:
                         title_path = curpath
                         break
         if not title_path and self.metadatautils.addon.getSetting("pvr_art_download") == "true":
             title_path = os.path.join(custom_path, normalize_string(title)) + delim
     return title_path
Code Example #22
File: colorthemes.py Project: camster1/RTOTV
    def create_colortheme(self):
        '''create a colortheme from current skin color settings'''
        try:
            current_skinfont = None
            json_response = kodi_json("Settings.GetSettingValue",
                                      {"setting": "lookandfeel.font"})
            if json_response:
                current_skinfont = json_response
            current_skincolors = None
            json_response = kodi_json("Settings.GetSettingValue",
                                      {"setting": "lookandfeel.skincolors"})
            if json_response:
                current_skincolors = json_response

            # user has to enter name for the theme
            themename = xbmcgui.Dialog().input(
                self.addon.getLocalizedString(32023),
                type=xbmcgui.INPUT_ALPHANUM).decode("utf-8")
            if not themename:
                return

            xbmc.executebuiltin("ActivateWindow(busydialog)")
            xbmc.executebuiltin(
                "Skin.SetString(SkinHelper.LastColorTheme,%s)" %
                themename.encode("utf-8"))

            # add screenshot
            custom_thumbnail = xbmcgui.Dialog().browse(
                2, self.addon.getLocalizedString(32024), 'files')

            if custom_thumbnail:
                xbmcvfs.copy(custom_thumbnail,
                             self.userthemes_path + themename + ".jpg")

            # read the guisettings file to get all skin settings
            from backuprestore import BackupRestore
            skinsettingslist = BackupRestore().get_skinsettings([
                "color", "opacity", "texture", "panel", "colour", "background",
                "image"
            ])
            newlist = []
            if skinsettingslist:
                newlist.append(("THEMENAME", themename))
                newlist.append(
                    ("DESCRIPTION", self.addon.getLocalizedString(32025)))
                newlist.append(
                    ("SKINTHEME", xbmc.getInfoLabel("Skin.CurrentTheme")))
                newlist.append(("SKINFONT", current_skinfont))
                newlist.append(("SKINCOLORS", current_skincolors))

                # look for any images in the skin settings and translate them so they can
                # be included in the theme backup
                for skinsetting in skinsettingslist:
                    setting_type = skinsetting[0]
                    setting_name = skinsetting[1]
                    setting_value = skinsetting[2]
                    if setting_type == "string" and setting_value:
                        if (setting_value
                                and (setting_value.endswith(".png")
                                     or setting_value.endswith(".gif")
                                     or setting_value.endswith(".jpg"))
                                and "resource://" not in setting_value):
                            image = get_clean_image(setting_value)
                            extension = image.split(".")[-1]
                            newimage = "%s_%s.%s" % (
                                themename, normalize_string(setting_name),
                                extension)
                            newimage_path = self.userthemes_path + newimage
                            if xbmcvfs.exists(image):
                                xbmcvfs.copy(image, newimage_path)
                                skinsetting = (setting_type, setting_name,
                                               newimage_path)
                    newlist.append(skinsetting)

                # save guisettings
                text_file_path = self.userthemes_path + themename + ".theme"
                text_file = xbmcvfs.File(text_file_path, "w")
                text_file.write(repr(newlist))
                text_file.close()
                xbmc.executebuiltin("Dialog.Close(busydialog)")
                xbmcgui.Dialog().ok(self.addon.getLocalizedString(32026),
                                    self.addon.getLocalizedString(32027))
        except Exception as exc:
            xbmc.executebuiltin("Dialog.Close(busydialog)")
            log_exception(__name__, exc)
            xbmcgui.Dialog().ok(self.addon.getLocalizedString(32028),
                                self.addon.getLocalizedString(32030), str(exc))
Code Example #23
 def is_former(position):
     status = False
     if 'uber' not in normalize_string(
             position) or 'ex-uber' in normalize_string(position):
         status = True
     return status
Code Example #24
def seq2words(sequences_list):
    normalized_seq_list = [utils.normalize_string(seq) for seq in sequences_list]
    token_seq_list = [seq.split() for seq in normalized_seq_list]
    return token_seq_list
Code Example #25
def main():
    user_input = 'Who cares?'
    sentence = utils.normalize_string(user_input)
    output_words, decoder_attn = evaluate(sentence)
    output_sentence = ' '.join(output_words)
    print("Sentence: {}\nTranslated Sentence: {}".format(user_input, output_sentence))
Code Example #26
 def is_driver(position):
     status = False
     if 'driver' in normalize_string(
             position) or 'motorista' in normalize_string(position):
         status = True
     return status
Code Example #27
File: generate_lm.py Project: pervrosen/FineMerge
def main():

    parser = argparse.ArgumentParser(
        description="Generate binary n-gram lm file given a text file"
    )
    parser.add_argument(
        "--text_file",
        help="Path to text file to train the lm from",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--order",
        help="order of lm to train",
        type=int,
        required=False,
        default=3,
    )
    parser.add_argument(
        "--exclude_text",
        help="Path to text file whose sentences must be excluded from training",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--kenlm_dir",
        help="Path to the kenlm directory required for training n-gram lm",
        type=str,
        required=False,
        default='~/exp/kenlm',
    )
    parser.add_argument(
        "--labels",
        help="Path to char level tokens for parsing",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--lm_path",
        help="Path to the save the trained lm to",
        type=str,
        required=True,
    )
    args = parser.parse_args()

    print("Preparing text for training the lm....")

    with open(args.labels) as label_file:
        labels = json.load(label_file)

    with open(args.text_file) as fd:
        lm_text = fd.read().splitlines()
        lm_text = set([normalize_string(sentence, labels[1:]) for sentence in lm_text])

    with open(args.exclude_text) as fd:
        text_to_exclude = fd.read().splitlines()
        text_to_exclude = set([normalize_string(sentence, labels[1:]) for \
            sentence in text_to_exclude])

    lm_text_final = lm_text - text_to_exclude
    with open('lm_text.txt', 'w') as fd:
        fd.write('\n'.join(lm_text_final))

    print('Build the arpa lm file of order {} ....'.format(args.order))

    command = '{} -o {} < lm_text.txt > lm.arpa'.format(
        os.path.join(args.kenlm_dir, 'build/bin/lmplz'),
        args.order)
    process = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    process.communicate()

    command = '{} lm.arpa {}'.format(
        os.path.join(args.kenlm_dir, 'build/bin/build_binary'),
        args.lm_path)
    process = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    process.communicate()

    os.remove('lm_text.txt')
    os.remove('lm.arpa')
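
Once build_binary has produced the file at --lm_path, it can be queried with the kenlm Python bindings. A small usage sketch (assumes the kenlm module is installed and 'lm.binary' is the path that was passed as --lm_path):

import kenlm

# Score a normalized sentence with the freshly built binary LM.
model = kenlm.Model('lm.binary')
print(model.score('hello world', bos=True, eos=True))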
Code Example #28
 def check(self, name, release_year):
     years = self.name_to_years[utils.normalize_string(name)]
     for year in years:
         if int(release_year) > year:
             return True
     return False
Code Example #29
 def __init__(self):
     df = pandas.read_csv(pathmng.wiki_best_actor_director_path)
     for index, row in df.iterrows():
         self.name_to_years[utils.normalize_string(row["name"])].add(row.year)
Code Example #30
import config
import pathmng
import utils
import json
from collections import defaultdict

import pandas
# pandas, defaultdict and udf are used below but were missing from this
# excerpt; udf is assumed to come from pyspark
from pyspark.sql.functions import udf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from crawl.imdb import lazyCrawl

rotten_path = "file:///" + config.crawl_data_path + "\\rotten.csv"
cleaned_rotten_path = "file:///" + config.crawl_data_path + "\\rotten_cleaned\\"

imdb_path = "file:///" + config.crawl_data_path + "\\imdb.csv"
cleaned_imdb_path = "file:///" + config.crawl_data_path + "\\imdb_cleaned.csv"

normalize_title_func = udf(lambda x: utils.normalize_string(x))


class AwardsCheckExist:

    name_to_years = defaultdict(lambda: set())
    _singleton = None

    def __init__(self):
        df = pandas.read_csv(pathmng.wiki_best_actor_director_path)
        for index, row in df.iterrows():
            self.name_to_years[utils.normalize_string(row["name"])].add(row.year)

    def check(self, name, release_year):
        years = self.name_to_years[utils.normalize_string(name)]
        for year in years:
            if int(release_year) > year:
                return True
        return False
Code Example #31
def getThemeMedia():

    doUtils = downloadutils.DownloadUtils()
    dialog = xbmcgui.Dialog()
    playback = None

    # Choose playback method
    resp = dialog.select(lang(33072), [lang(30165), lang(33071)])
    if resp == 0:
        playback = "DirectPlay"
    elif resp == 1:
        playback = "DirectStream"
    else:
        return

    library = xbmc.translatePath(
                "special://profile/addon_data/emby.for.kodi/library/").decode('utf-8')
    # Create library directory
    if not xbmcvfs.exists(library):
        xbmcvfs.mkdir(library)

    # Set custom path for user
    if xbmc.getCondVisibility('System.HasAddon(script.tvtunes)'):
        tvtunes = xbmcaddon.Addon(id="script.tvtunes")
        tvtunes.setSetting('custom_path_enable', "true")
        tvtunes.setSetting('custom_path', library)
        log.info("TV Tunes custom path is enabled and set.")
    else:
        # if it does not exist this will not work so warn user
        # often they need to edit the settings first for it to be created.
        dialog.ok(heading=lang(29999), line1=lang(33073))
        xbmc.executebuiltin('Addon.OpenSettings(script.tvtunes)')
        return
        
    # Get every user view Id
    with database.DatabaseConn('emby') as cursor:
        emby_db = embydb.Embydb_Functions(cursor)
        viewids = emby_db.getViews()

    # Get Ids with Theme Videos
    itemIds = {}
    for view in viewids:
        url = "{server}/emby/Users/{UserId}/Items?HasThemeVideo=True&ParentId=%s&format=json" % view
        result = doUtils.downloadUrl(url)
        if result['TotalRecordCount'] != 0:
            for item in result['Items']:
                itemId = item['Id']
                folderName = item['Name']
                folderName = utils.normalize_string(folderName.encode('utf-8'))
                itemIds[itemId] = folderName

    # Get paths for theme videos
    for itemId in itemIds:
        nfo_path = xbmc.translatePath(
            "special://profile/addon_data/emby.for.kodi/library/%s/" % itemIds[itemId])
        # Create folders for each content
        if not xbmcvfs.exists(nfo_path):
            xbmcvfs.mkdir(nfo_path)
        # Where to put the nfos
        nfo_path = "%s%s" % (nfo_path, "tvtunes.nfo")

        url = "{server}/emby/Items/%s/ThemeVideos?format=json" % itemId
        result = doUtils.downloadUrl(url)

        # Create nfo and write themes to it
        nfo_file = xbmcvfs.File(nfo_path, 'w')
        pathstowrite = ""
        # May be more than one theme
        for theme in result['Items']:
            putils = playutils.PlayUtils(theme)
            if playback == "DirectPlay":
                playurl = putils.directPlay()
            else:
                playurl = putils.directStream()
            pathstowrite += ('<file>%s</file>' % playurl.encode('utf-8'))
        
        # Check if the item has theme songs and add them   
        url = "{server}/emby/Items/%s/ThemeSongs?format=json" % itemId
        result = doUtils.downloadUrl(url)

        # May be more than one theme
        for theme in result['Items']: 
            if playback == "DirectPlay":
                playurl = api.API(theme).get_file_path()
            else:
                playurl = playutils.PlayUtils(theme).directStream()
            pathstowrite += ('<file>%s</file>' % playurl.encode('utf-8'))

        nfo_file.write(
            '<tvtunes>%s</tvtunes>' % pathstowrite
        )
        # Close nfo file
        nfo_file.close()

    # Get Ids with Theme songs
    musicitemIds = {}
    for view in viewids:
        url = "{server}/emby/Users/{UserId}/Items?HasThemeSong=True&ParentId=%s&format=json" % view
        result = doUtils.downloadUrl(url)
        if result['TotalRecordCount'] != 0:
            for item in result['Items']:
                itemId = item['Id']
                folderName = item['Name']
                folderName = utils.normalize_string(folderName.encode('utf-8'))
                musicitemIds[itemId] = folderName

    # Get paths
    for itemId in musicitemIds:
        
        # if the item was already processed with video themes back out
        if itemId in itemIds:
            continue
        
        nfo_path = xbmc.translatePath(
            "special://profile/addon_data/emby.for.kodi/library/%s/" % musicitemIds[itemId])
        # Create folders for each content
        if not xbmcvfs.exists(nfo_path):
            xbmcvfs.mkdir(nfo_path)
        # Where to put the nfos
        nfo_path = "%s%s" % (nfo_path, "tvtunes.nfo")
        
        url = "{server}/emby/Items/%s/ThemeSongs?format=json" % itemId
        result = doUtils.downloadUrl(url)

        # Create nfo and write themes to it
        nfo_file = xbmcvfs.File(nfo_path, 'w')
        pathstowrite = ""
        # May be more than one theme
        for theme in result['Items']: 
            if playback == "DirectPlay":
                playurl = api.API(theme).get_file_path()
            else:
                playurl = playutils.PlayUtils(theme).directStream()
            pathstowrite += ('<file>%s</file>' % playurl.encode('utf-8'))

        nfo_file.write(
            '<tvtunes>%s</tvtunes>' % pathstowrite
        )
        # Close nfo file
        nfo_file.close()

    dialog.notification(
            heading=lang(29999),
            message=lang(33069),
            icon="special://home/addons/emby.for.kodi/icon.png",
            time=1000,
            sound=False)
Code Example #32
def getThemeMedia():

    doUtils = downloadutils.DownloadUtils()
    dialog = xbmcgui.Dialog()
    playback = None

    # Choose playback method
    resp = dialog.select("Playback method for your themes", ["Direct Play", "Direct Stream"])
    if resp == 0:
        playback = "DirectPlay"
    elif resp == 1:
        playback = "DirectStream"
    else:
        return

    library = xbmc.translatePath(
                "special://profile/addon_data/plugin.video.emby/library/").decode('utf-8')
    # Create library directory
    if not xbmcvfs.exists(library):
        xbmcvfs.mkdir(library)

    # Set custom path for user
    tvtunes_path = xbmc.translatePath(
        "special://profile/addon_data/script.tvtunes/").decode('utf-8')
    if xbmcvfs.exists(tvtunes_path):
        tvtunes = xbmcaddon.Addon(id="script.tvtunes")
        tvtunes.setSetting('custom_path_enable', "true")
        tvtunes.setSetting('custom_path', library)
        utils.logMsg("EMBY", "TV Tunes custom path is enabled and set.", 1)
    else:
        # if it does not exist this will not work so warn user
        # often they need to edit the settings first for it to be created.
        dialog.ok(
            heading="Warning",
            line1=(
                "The settings file does not exist in tvtunes. "
                "Go to the tvtunes addon and change a setting, then come back and re-run."))
        xbmc.executebuiltin('Addon.OpenSettings(script.tvtunes)')
        return
        
    # Get every user view Id
    embyconn = utils.kodiSQL('emby')
    embycursor = embyconn.cursor()
    emby_db = embydb.Embydb_Functions(embycursor)
    viewids = emby_db.getViews()
    embycursor.close()

    # Get Ids with Theme Videos
    itemIds = {}
    for view in viewids:
        url = "{server}/emby/Users/{UserId}/Items?HasThemeVideo=True&ParentId=%s&format=json" % view
        result = doUtils.downloadUrl(url)
        if result['TotalRecordCount'] != 0:
            for item in result['Items']:
                itemId = item['Id']
                folderName = item['Name']
                folderName = utils.normalize_string(folderName.encode('utf-8'))
                itemIds[itemId] = folderName

    # Get paths for theme videos
    for itemId in itemIds:
        nfo_path = xbmc.translatePath(
            "special://profile/addon_data/plugin.video.emby/library/%s/" % itemIds[itemId])
        # Create folders for each content
        if not xbmcvfs.exists(nfo_path):
            xbmcvfs.mkdir(nfo_path)
        # Where to put the nfos
        nfo_path = "%s%s" % (nfo_path, "tvtunes.nfo")

        url = "{server}/emby/Items/%s/ThemeVideos?format=json" % itemId
        result = doUtils.downloadUrl(url)

        # Create nfo and write themes to it
        nfo_file = xbmcvfs.File(nfo_path, 'w')
        pathstowrite = ""
        # May be more than one theme
        for theme in result['Items']:
            putils = playutils.PlayUtils(theme)
            if playback == "DirectPlay":
                playurl = putils.directPlay()
            else:
                playurl = putils.directStream()
            pathstowrite += ('<file>%s</file>' % playurl.encode('utf-8'))
        
        # Check if the item has theme songs and add them   
        url = "{server}/emby/Items/%s/ThemeSongs?format=json" % itemId
        result = doUtils.downloadUrl(url)

        # May be more than one theme
        for theme in result['Items']:
            putils = playutils.PlayUtils(theme)  
            if playback == "DirectPlay":
                playurl = putils.directPlay()
            else:
                playurl = putils.directStream()
            pathstowrite += ('<file>%s</file>' % playurl.encode('utf-8'))

        nfo_file.write(
            '<tvtunes>%s</tvtunes>' % pathstowrite
        )
        # Close nfo file
        nfo_file.close()

    # Get Ids with Theme songs
    musicitemIds = {}
    for view in viewids:
        url = "{server}/emby/Users/{UserId}/Items?HasThemeSong=True&ParentId=%s&format=json" % view
        result = doUtils.downloadUrl(url)
        if result['TotalRecordCount'] != 0:
            for item in result['Items']:
                itemId = item['Id']
                folderName = item['Name']
                folderName = utils.normalize_string(folderName.encode('utf-8'))
                musicitemIds[itemId] = folderName

    # Get paths
    for itemId in musicitemIds:
        
        # if the item was already processed with video themes back out
        if itemId in itemIds:
            continue
        
        nfo_path = xbmc.translatePath(
            "special://profile/addon_data/plugin.video.emby/library/%s/" % musicitemIds[itemId])
        # Create folders for each content
        if not xbmcvfs.exists(nfo_path):
            xbmcvfs.mkdir(nfo_path)
        # Where to put the nfos
        nfo_path = "%s%s" % (nfo_path, "tvtunes.nfo")
        
        url = "{server}/emby/Items/%s/ThemeSongs?format=json" % itemId
        result = doUtils.downloadUrl(url)

        # Create nfo and write themes to it
        nfo_file = xbmcvfs.File(nfo_path, 'w')
        pathstowrite = ""
        # May be more than one theme
        for theme in result['Items']: 
            putils = playutils.PlayUtils(theme)
            if playback == "DirectPlay":
                playurl = putils.directPlay()
            else:
                playurl = putils.directStream()
            pathstowrite += ('<file>%s</file>' % playurl.encode('utf-8'))

        nfo_file.write(
            '<tvtunes>%s</tvtunes>' % pathstowrite
        )
        # Close nfo file
        nfo_file.close()

    dialog.notification(
            heading="Emby for Kodi",
            message="Themes added!",
            icon="special://home/addons/plugin.video.emby/icon.png",
            time=1000,
            sound=False)
Code Example #33
 def _read(self, filename):
     with open(filename) as fp:
         content = fp.read()
     return normalize_string(content)