def test_provision_from_template(provider, testing_instance, soft_assert):
    """ Tests instance provision from template

    Metadata:
        test_flag: provision
    """
    instance, inst_args, image = testing_instance
    instance.create(**inst_args)
    logger.info('Waiting for cfme provision request for vm %s', instance.name)
    row_description = 'Provision from [{}] to [{}]'.format(image, instance.name)
    cells = {'Description': row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload, num_sec=1500, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == 'ok' and \
        normalize_text(row.request_state.text) == 'finished', \
        "Provisioning failed with the message {}".format(row.last_message.text)
    instance.wait_to_appear(timeout=800)
    provider.refresh_provider_relationships()
    logger.info("Refreshing provider relationships and power states")
    refresh_timer = RefreshTimer(time_for_refresh=300)
    wait_for(provider.is_refreshed,
             [refresh_timer],
             message="is_refreshed",
             num_sec=1000,
             delay=60,
             handle_exception=True)
    soft_assert(instance.does_vm_exist_on_provider(), "Instance wasn't provisioned")
def test_provision_from_template(request, setup_provider, provider, testing_instance, soft_assert):
    """ Tests instance provision from template

    Metadata:
        test_flag: provision
    """
    instance, inst_args, image = testing_instance
    instance.create(**inst_args)
    logger.info('Waiting for cfme provision request for vm %s', instance.name)
    row_description = 'Provision from [{}] to [{}]'.format(image, instance.name)
    cells = {'Description': row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload, num_sec=1500, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == 'ok' and \
        normalize_text(row.request_state.text) == 'finished', \
        "Provisioning failed with the message {}".format(row.last_message.text)
    instance.wait_to_appear(timeout=800)
    provider.refresh_provider_relationships()
    logger.info("Refreshing provider relationships and power states")
    refresh_timer = RefreshTimer(time_for_refresh=300)
    wait_for(provider.is_refreshed,
             [refresh_timer],
             message="is_refreshed",
             num_sec=1000,
             delay=60,
             handle_exception=True)
    soft_assert(instance.does_vm_exist_on_provider(), "Instance wasn't provisioned")
def verify():
    return (
        len(filter(
            lambda mail: "your virtual machine request has completed vm {}".format(
                normalize_text(vm_name)) in normalize_text(mail["subject"]),
            smtp_test.get_emails())) == len(vm_names)
    )
def input_load(mode="train"):
    """ Load the input text and the corresponding feature labels
    :param mode: whether to gather data for training and evaluation or for synthesis
    :return: the audio file paths, the text lengths, and the encoded texts
    """
    # creates vocab conversion dictionaries
    char2idx, _ = create_vocab()
    fpaths, text_lengths, texts = [], [], []
    # the path to the dataset
    base_path = os.path.join(DATA_PATH, 'wavs')
    # the path to the text
    transcript = os.path.join(DATA_PATH, 'metadata.csv')
    # training or evaluation
    if mode in ("train", "eval"):
        # Each epoch
        for _ in range(NUM_EPOCHS):
            # open the text file
            lines = codecs.open(transcript, 'r', ENCODING).readlines()
            for line in lines:
                fname, _, text = line.strip().split("|")
                # get the wav file paths
                fpath = os.path.join(base_path, fname + ".wav")
                fpaths.append(fpath)
                # clean and normalize the text
                text = normalize_text(text) + "$"  # E: EOS
                text = [char2idx[char] for char in text]
                text_lengths.append(len(text))
                texts.append(np.array(text, np.int32).tostring())
        return fpaths, text_lengths, texts
    else:  # synthesis
        # Parse
        lines = codecs.open(TEST_DATA, 'r', 'utf-8').readlines()[1:]
        # Normalize text: $ is EOS
        sents = [
            normalize_text(line.split(" ", 1)[-1]).strip() + "$"
            for line in lines
        ]
        lengths = [len(sent) for sent in sents]
        maxlen = sorted(lengths, reverse=True)[0]
        # Pad the text
        texts = np.zeros((len(sents), maxlen), np.int32)
        for i, sent in enumerate(sents):
            texts[i, :len(sent)] = [char2idx[char] for char in sent]
        # return just the text, no lengths or paths needed
        return texts
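# Hypothetical usage sketch (not from the original source): it assumes input_load and the
# module constants above are importable, and only illustrates that each serialized entry
# produced by np.array(..., np.int32).tostring() can be recovered with np.frombuffer.
import numpy as np

fpaths, text_lengths, texts = input_load(mode="train")
first = np.frombuffer(texts[0], dtype=np.int32)  # character indices of the first utterance
assert len(first) == text_lengths[0]
print(fpaths[0], first[:10])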
def do_vm_provisioning(template_name, provider, vm_name, provisioning_data, request, smtp_test,
                       num_sec=1500, wait=True):
    # generate_tests makes sure these have values
    sel.force_navigate('infrastructure_provision_vms', context={
        'provider': provider,
        'template_name': template_name,
    })
    note = ('template {} to vm {} on provider {}'.format(template_name, vm_name, provider.key))
    provisioning_data.update({
        'email': '*****@*****.**',
        'first_name': 'Template',
        'last_name': 'Provisioner',
        'notes': note,
    })
    fill(provisioning_form, provisioning_data, action=provisioning_form.submit_button)
    flash.assert_no_errors()
    if not wait:
        return

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info('Waiting for vm %s to appear on provider %s', vm_name, provider.key)
    wait_for(provider.mgmt.does_vm_exist, [vm_name], handle_exception=True, num_sec=600)

    # nav to requests page happens on successful provision
    logger.info('Waiting for cfme provision request for vm %s', vm_name)
    row_description = 'Provision from [{}] to [{}]'.format(template_name, vm_name)
    cells = {'Description': row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload, num_sec=num_sec, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == 'ok' \
        and normalize_text(row.request_state.text) == 'finished'

    if smtp_test:
        # Wait for e-mails to appear
        def verify():
            if current_version() >= "5.4":
                approval = dict(subject_like="%%Your Virtual Machine configuration was Approved%%")
            else:
                approval = dict(text_like="%%Your Virtual Machine Request was approved%%")
            expected_text = "Your virtual machine request has Completed - VM:%%{}".format(vm_name)
            return (
                len(smtp_test.get_emails(**approval)) > 0
                and len(smtp_test.get_emails(subject_like=expected_text)) > 0
            )

        wait_for(verify, message="email receive check", delay=5)
def load_dataset(self, dataset, columns=None, drop_nan=True, reset_data=True):
    self.vprint('loading dataset {0}'.format(
        dataset if isinstance(dataset, str) else 'from pandas DataFrame'))

    if isinstance(dataset, str):
        dataset = pd.read_csv(
            dataset, header=0
        )  # read csv assuming first line has header text. TODO handle files w/o headers
    else:
        assert isinstance(dataset, pd.DataFrame)

    headers = dataset.columns.values

    if columns:
        text_df = dataset[columns]
    else:
        text_df = dataset.select_dtypes([
            'object'
        ])  # keep only text columns (pandas strings are of dtype 'object')
        # TODO confirm that the columns selected can't be cast to a numeric type to avoid numeric strings (e.g. '1')

    dtype_dropped = get_dropped(headers, text_df.columns.values)
    self.vprint('\ndropped non-text columns: {0}'.format(list(dtype_dropped)))

    if drop_nan:  # drop columns if there are any missing values
        # TODO handle missing values w/o dropping whole column
        text_df = text_df.dropna(axis=1, how='any')
        nan_dropped = get_dropped(headers, text_df.columns.values)
        nan_dropped = nan_dropped.difference(dtype_dropped)
        if nan_dropped:
            self.vprint('\ndropped columns with missing values: {0}'.format(
                list(nan_dropped)))

    if not reset_data:
        # TODO implement variant where data is appended instead of overwritten
        raise Exception('not implemented')

    self.data = {}
    self.vprint('\nnormalizing headers')
    self.data['headers'] = self.format_data(headers)

    for col in text_df.columns.values:
        self.vprint('\nnormalizing column: {0}'.format(
            normalize_text(col, to_list=False)))
        self.data[normalize_text(col, to_list=False)] = self.format_data(
            text_df[col].values)

    return self.data
def __get_actual_affiliations(affiliations, author_affiliation):
    actual_affiliations = []
    author_affiliation = author_affiliation.strip()
    for affiliation in affiliations:
        affiliation = affiliation.strip()
        # normalize text before comparison
        affiliation_nor = normalize_text(affiliation)
        author_affiliation_nor = normalize_text(author_affiliation)
        if affiliation_nor.lower() in author_affiliation_nor.lower():
            if affiliation:
                actual_affiliations.append(affiliation.strip())
    return actual_affiliations
def get_word_dependent_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        words, _ = utils.tokenize_word_and_sentence(normalized_text)
        C = len(normalized_text)
        N = len(words)

        # Word-based features
        # total number of words
        feature['WD_F7'] = N
        # average number of characters per word
        feature['WD_F8'] = C / N
        # lexical richness (number of unique words divided by total number of words)
        V = set(words)
        feature['WD_F9'] = len(V) / N
        # long words (at least 3 characters) / N
        long_words = [w for w in words if len(w) >= 3]
        feature['WD_F10'] = len(long_words) / N
        # short words (at most 2 characters) / N
        short_words = [w for w in words if len(w) <= 2]
        feature['WD_F11'] = len(short_words) / N
        # words occurring exactly once (hapax legomena)
        counts = Counter(words)
        unique_words = [w for w in words if counts[w] == 1]
        feature['WD_F12'] = len(unique_words)
        # words occurring exactly twice
        double_words = [w for w in words if counts[w] == 2]
        feature['WD_F13'] = len(double_words)
        # Yule's K measure
        yules_k = 10000 * (-1 * (1.0 / N) + sum(
            list([(len(list(w for w in V if counts[w] == i))) * ((i / N)**2)
                  for i in range(1, len(V) + 1)])))
        feature['WD_F14'] = yules_k
        # Simpson's D measure
        simpsons_d = sum((len(list(w for w in V if counts[w] == i))) * (i / N) *
                         ((i - 1) / (N - 1)) for i in range(1, len(V)))
        feature['WD_F15'] = simpsons_d
        # Sichel's S measure
        sichels_s = len(double_words) / len(V)
        feature['WD_F16'] = sichels_s
        # Honoré's R measure
        delimiter = 1 - len(unique_words) / len(V)
        if delimiter == 0:
            delimiter = 0.0001
        honores_R = (100 * math.log(N)) / (delimiter)
        feature['WD_F17'] = honores_R
        # entropy measure
        entropy = sum((len(list(w for w in V if counts[w] == i))) * (i / N) *
                      (-1 * math.log(i / N)) for i in range(1, len(V)))
        feature['WD_F50'] = entropy

        feature['number'] = key
        features.append(feature)
    return pd.DataFrame(features)
def load_embeddings(vocab, filepath, embedding_size):
    '''
    input:
        vocab: the vocab of squad
        filepath: glove file path
        embedding_size: embedding dim
    return:
        embeddings: the whole embedding
        fixed_embeddings: the fixed embedding
    '''
    embeddings = np.random.normal(0.00, 1.00, [len(vocab), embedding_size])
    count = 0
    with open(filepath, "r", encoding="utf8") as f:
        for line in f:
            word = line.rstrip().split(' ')[0]
            word = utils.normalize_text(word)
            if word in vocab:
                count += 1
                vec = line.strip().split(" ")[1:]
                vec = np.array(vec)
                embeddings[vocab[word]] = vec
    embeddings[vocab["__NULL__"]] = np.zeros(embedding_size)
    fixed_embeddings = embeddings[FIXED_EMBEDDING_NUM + 3:]
    print('the glove count:', count)
    return embeddings, fixed_embeddings
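# Hypothetical usage sketch (not part of the original code): the toy vocab, the
# "glove.840B.300d.txt" path and the 300-dimension value are stand-in assumptions,
# and FIXED_EMBEDDING_NUM is expected to be defined in the module as above. It only
# shows the shapes of the two arrays returned by load_embeddings.
toy_vocab = {"__NULL__": 0, "the": 1, "squad": 2, "dataset": 3}
embeddings, fixed_embeddings = load_embeddings(toy_vocab, "glove.840B.300d.txt", 300)
print(embeddings.shape)        # (len(toy_vocab), 300)
print(fixed_embeddings.shape)  # rows after the first FIXED_EMBEDDING_NUM + 3 entries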
def get_structural_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        words, sentences = utils.tokenize_word_and_sentence(normalized_text)
        N = len(words)
        S = len(sentences)

        # Structural features
        # total number of lines
        feature['STR_F25'] = text.count('\n')
        # total number of sentences = S
        feature['STR_F26'] = S
        # average number of words per sentence
        feature['STR_F27'] = N / S
        # number of empty lines / total number of lines
        empty_lines = text.replace(" ", "").count('\n\n')
        total_lines = text.replace(" ", "").count('\n') + 1
        feature['STR_F28'] = empty_lines / total_lines
        # average length of non-empty lines
        feature['STR_F29'] = len(text) / (total_lines - empty_lines)

        feature['number'] = key
        features.append(feature)
    return pd.DataFrame(features)
def get_text_dependent_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        # words, _ = utils.tokenize_word_and_sentence(normalized_text)
        alphabets_in_texts = [i for i in Counter(normalized_text) if i in config.alphabet]
        C = len(normalized_text)
        # N = len(words)

        # total number of characters = C
        feature['TD_F1'] = C
        # number of distinct alphabet letters / C
        feature['TD_F2'] = len(alphabets_in_texts) / C
        # number of distinct alphabet letters
        feature['TD_F49'] = len(alphabets_in_texts)
        # total number of digits
        feature['TD_F3'] = utils.count_chars(normalized_text, config.numbers)
        # number of space characters / C
        feature['TD_F4'] = text.count(' ') / C
        # number of tab characters / C
        feature['TD_F5'] = text.count('\t') / C
        # number of special characters / C
        feature['TD_F6'] = utils.count_chars(normalized_text, config.special_chars) / C

        feature['number'] = key
        features.append(feature)
    return pd.DataFrame(features)
def get_syntactics_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        words, _ = utils.tokenize_word_and_sentence(normalized_text)
        C = len(normalized_text)

        # Syntactic features
        # number of commas / C
        feature['SYN_F18'] = text.count('،') / C
        # number of periods / C
        feature['SYN_F19'] = text.count('.') / C
        # number of colons / C
        feature['SYN_F20'] = text.count(':') / C
        # number of semicolons / C
        feature['SYN_F21'] = text.count(';') / C
        # number of question marks / C
        feature['SYN_F22'] = text.count('؟') / C
        # number of exclamation marks / C
        feature['SYN_F23'] = text.count('!') / C
        # number of triple exclamation marks / C
        feature['SYN_F24'] = text.count('!!!') / C

        feature['number'] = key
        features.append(feature)
    return pd.DataFrame(features)
def __getitem__(self, idx):
    words, tags = self.sents[idx], self.tags_li[idx]

    x, y = [], []
    is_heads = []
    for w, t in zip(words, tags):
        w = normalize_text(w)
        tokens = self.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
        xx = self.tokenizer.convert_tokens_to_ids(tokens)
        # assert len(tokens) == len(xx), f"len(tokens)={len(tokens)}, len(xx)={len(xx)}"
        # Chinese characters are not split into multiple WordPiece pieces the way English words are
        is_head = [1] + [0] * (len(xx) - 1)
        # t = [t] + ['<PAD>'] * (len(tokens) - 1)
        yy = [self.tag2idx[t]]  # (T,)

        x.extend(xx)
        is_heads.extend(is_head)
        y.extend(yy)

    assert len(x) == len(y) == len(
        is_heads
    ), f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)}"

    # seqlen
    seqlen = len(y)

    # to string
    words = " ".join(words)
    tags = " ".join(tags)
    return words, x, is_heads, tags, y, seqlen
def dl_model_classify(self, text, normalize=True):
    """
    Classify the given text into the appropriate classes (multi-label
    classification per text) using the deep learning model.

    Args:
        text (str): the text to classify.
        normalize (bool): if True, normalize the text before passing it to the
            model. Default True.

    Returns:
        A list of 11 items (one per class) containing 0's and 1's: 1 means the
        text maps to that class, 0 means it does not. The classes are:
        [
            'فن ومشاهير', 'أخبار', 'رياضة', 'اقتصاد', 'تكنولوجيا',
            'اسلام و أديان', 'سيارات', 'طقس', 'منوعات أخرى', 'صحة', 'مطبخ'
        ]
        (art and celebrities, news, sports, economy, technology,
        Islam and religions, cars, weather, other miscellaneous, health, cuisine)
    """
    if normalize:
        text = [normalize_text(text)]
    embedded_text = self.dl_vectorizer.embed_batch(text, MAX_SENTENCE_LENGTH)
    predictions = (self.dl_model.predict(embedded_text) > 0.5).astype(int)
    return predictions.tolist()[0]
async def parse_facts_component(html_content):
    meta_data = {}
    elem = html_content.find("ol", class_="chart")
    meta_data[html_content.find("strong").text.strip()] = [
        i.text.strip() for i in elem.findAll("li")
    ]

    elem_2 = html_content.findAll("div", class_="form-group")[1]
    meta_data[elem_2.find("strong").text.strip()] = [
        i.text.strip() for i in elem_2.findAll("li")
    ]

    text = ""
    for k, v in meta_data.items():
        k = normalize_text(k)
        text += f"<{k}>"
        for i in v:
            if not i:
                continue
            text += "<value>"
            text += f"{i}"
            text += "</value>"
        text += f"</{k}>"

    return prettify(text, "facts")
def do_vm_provisioning(
    template_name, provider, vm_name, provisioning_data, request, smtp_test, num_sec=1500, wait=True
):
    # generate_tests makes sure these have values
    vm = Vm(name=vm_name, provider=provider, template_name=template_name)
    navigate_to(vm, "ProvisionVM")

    note = "template {} to vm {} on provider {}".format(template_name, vm_name, provider.key)
    provisioning_data.update(
        {
            "email": "*****@*****.**",
            "first_name": "Template",
            "last_name": "Provisioner",
            "notes": note,
        }
    )

    fill(provisioning_form, provisioning_data, action=provisioning_form.submit_button)
    flash.assert_no_errors()
    if not wait:
        return

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info("Waiting for vm %s to appear on provider %s", vm_name, provider.key)
    wait_for(provider.mgmt.does_vm_exist, [vm_name], handle_exception=True, num_sec=600)

    # nav to requests page happens on successful provision
    logger.info("Waiting for cfme provision request for vm %s", vm_name)
    row_description = "Provision from [{}] to [{}]".format(template_name, vm_name)
    cells = {"Description": row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload, num_sec=num_sec, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == "ok" and normalize_text(row.request_state.text) == "finished"

    if smtp_test:
        # Wait for e-mails to appear
        def verify():
            approval = dict(subject_like="%%Your Virtual Machine configuration was Approved%%")
            expected_text = "Your virtual machine request has Completed - VM:%%{}".format(vm_name)
            return (
                len(smtp_test.get_emails(**approval)) > 0
                and len(smtp_test.get_emails(subject_like=expected_text)) > 0
            )

        wait_for(verify, message="email receive check", delay=5)
def __affiliations_to_save(affiliations, new_affiliations):
    jarowinkler = JaroWinkler()
    similarity_threshold = 0.95
    affiliations_to_save = []
    for new_affiliation in new_affiliations:
        exist_affiliation = False
        for affiliation in affiliations:
            # normalize text before comparison
            affiliation_nor = normalize_text(affiliation)
            new_affiliation_nor = normalize_text(new_affiliation)
            similarity_score = jarowinkler.similarity(
                affiliation_nor.lower(), new_affiliation_nor.lower())
            if similarity_score >= similarity_threshold:
                exist_affiliation = True
        if not exist_affiliation:
            affiliations_to_save.append(new_affiliation)
    return affiliations_to_save
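# Hypothetical illustration (not from the original source): it assumes the JaroWinkler
# class used above exposes similarity(str, str) -> float (as e.g. strsimpy's does) and
# shows how the 0.95 threshold treats a near-duplicate affiliation string.
jw = JaroWinkler()
score = jw.similarity("university of oxford", "university of oxfordd")
print(score)           # close to 1.0 for a near-duplicate
print(score >= 0.95)   # typically above the threshold, so the new affiliation is not saved again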
def load_glovedict(filepath):
    embedding_words = set()
    with open(filepath, "r", encoding="utf8") as f:
        for line in f:
            w = utils.normalize_text(line.rstrip().split(' ')[0])
            embedding_words.add(w)
    print('Num words in glove dict is', len(embedding_words))
    return embedding_words
def disambiguate_objs_by_name(objs, ref_name):
    similarity = []
    for obj in objs:
        nor_name = utils.normalize_text(obj.name)
        names = itertools.izip_longest(nor_name, ref_name)
        similarity.append(len([c1 for c1, c2 in names if c1 == c2]))
    idx_max_sim = similarity.index(max(similarity))
    return objs[idx_max_sim]
def get_sentences(dataset):
    sentences = []
    for text in dataset.values():
        normalized_text = utils.normalize_text(text)
        words, _ = utils.tokenize_word_and_sentence(normalized_text,
                                                    include_special_chars=True)
        sentences.append(words)
    return sentences
def extract_keywords(text, language):
    normalized_text = utils.normalize_text(text)
    sentences_list = utils.split_sentence(normalized_text)
    stopwords_regex = __get_stopwords_regex(language)
    keywords_candidate = __extract_keywords_candidate(sentences_list, stopwords_regex)
    word_scores = __calculate_word_scores(keywords_candidate)
    keywords_scores = __calculate_keyword_scores(keywords_candidate, word_scores)
    keywords_selected = __selected_keywords(keywords_scores)
    return keywords_selected
def filedump(self, description, contents, slaveid=None, mode="w", contents_base64=False,
             display_type="primary", display_glyph=None, file_type=None,
             dont_write=False, os_filename=None, group_id=None, test_name=None,
             test_location=None):
    if slaveid is not None:
        if not slaveid:
            slaveid = "Master"
        test_ident = "{}/{}".format(self.store[slaveid]['test_location'],
                                    self.store[slaveid]['test_name'])
    else:
        test_ident = "{}/{}".format(test_location, test_name)
    artifacts = []
    if os_filename is None:
        safe_name = re.sub(r"\s+", "_", normalize_text(safe_string(description)))
        os_filename = self.ident + "-" + safe_name
        os_filename = os.path.join(self.store[slaveid]['artifact_path'], os_filename)
        if file_type is not None and "screenshot" in file_type:
            os_filename = os_filename + ".png"
        elif file_type is not None and (
                "_tb" in file_type or "traceback" in file_type or file_type == "log"):
            os_filename = os_filename + ".log"
        elif file_type is not None and file_type == "html":
            os_filename = os_filename + ".html"
        elif file_type is not None and file_type == "video":
            os_filename = os_filename + ".ogv"
        else:
            os_filename = os_filename + ".txt"
    artifacts.append({
        "file_type": file_type,
        "display_type": display_type,
        "display_glyph": display_glyph,
        "description": description,
        "os_filename": os_filename,
        "group_id": group_id,
    })
    if not dont_write:
        if os.path.isfile(os_filename):
            os.remove(os_filename)
        with open(os_filename, mode) as f:
            if contents_base64:
                contents = base64.b64decode(contents)
            f.write(contents)
    return None, {'artifacts': {test_ident: {'files': artifacts}}}
def get_grammatical_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        tagger = hazm.POSTagger(model=data_path.POSTAGGER_MODEL_PATH)
        tags = tagger.tag(hazm.word_tokenize(normalized_text))
        tags_list = [i[1] for i in tags]

        sounds = [utils.normalize_text(sound) for sound in config.sounds]
        group_pros = [
            utils.normalize_text(group_pro) for group_pro in config.group_pro
        ]
        conjunctions = [
            utils.normalize_text(conjunction) for conjunction in config.conjunctions
        ]
        subjective_pronounces = [
            utils.normalize_text(subjective_pronounce)
            for subjective_pronounce in config.subjective_pronounce
        ]

        feature['GRM_F30'] = utils.count_chars(text, subjective_pronounces)
        feature['GRM_F31'] = utils.count_chars(text, config.question)
        feature['GRM_F32'] = utils.count_chars(text, conjunctions)
        feature['GRM_F33'] = utils.count_chars(text, group_pros)
        feature['GRM_F34'] = utils.count_chars(text, sounds)
        feature['GRM_F35'] = tags_list.count('P') + tags_list.count('POSTP')
        feature['GRM_F40'] = tags_list.count('AJ')
        feature['GRM_F41'] = tags_list.count('ADV')
        feature['GRM_F42'] = tags_list.count('PRO')
        feature['GRM_F51'] = tags_list.count('NUM')

        feature['number'] = key
        features.append(feature)
    return pd.DataFrame(features)
def search_obj_by_name(model, name):
    nor_name = utils.normalize_text(name).strip()
    objs_to_return = model.objects.filter(name__icontains=nor_name)
    if len(objs_to_return) == 1:
        return objs_to_return
    else:
        if len(objs_to_return) == 0:
            objs_to_return = search_most_similar_strings(model, nor_name)
        if len(objs_to_return) > 1:
            objs_to_return = [disambiguate_objs_by_name(objs_to_return, nor_name)]
        return objs_to_return
async def parse_details_component(html_content):
    meta_data = {}
    for i in html_content.findAll("div", class_="form-group"):
        key = i.find("strong").text.strip()
        meta_data[key] = {}
        keys = i.findAll("div", class_="col-xs-7")
        values = i.findAll("div", class_="col-xs-5")
        for k, v in zip(keys, values):
            k, v = k.text.strip(), v.text.strip()
            meta_data[key][k] = v

    text = ""
    for k, v in meta_data.items():
        k = normalize_text(k)
        text += f"<{k}>"
        for k1, v1 in v.items():
            k1 = normalize_text(k1)
            text += f"<{k1}>{v1}</{k1}>"
        text += f"</{k}>"

    return prettify(text, "details")
def tokenization(data, tokenizer):
    context = []
    for i in data:
        text = normalize_text(i)
        encode = tokenizer.encode(text)
        context.append(encode)
    return context
def load_wv_vocab(vocab_file):
    '''Load the vocabulary from a word-vector (embedding) file.'''
    vocab = set()
    with open(vocab_file, encoding='utf8') as f:
        for line in f:
            elems = line.split()
            token = normalize_text(''.join(elems[0:-args.embedding_dim]))
            vocab.add(token)
    return vocab
def get_or_create_player(tournament_obj, player_dict):
    player_name = utils.normalize_text(player_dict['name'])
    ret_obj = search_person_by_name(player_name)
    if not ret_obj:
        # the player doesn't exist yet
        player_obj = create_new_person(player_dict)
    elif len(ret_obj) > 1:
        player_obj = disambiguate_player(ret_obj, tournament_obj)
    else:
        player_obj = ret_obj[0]
    update_person(player_obj, player_dict)
    return player_obj
def get_psychological_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)

        colors = [utils.normalize_text(color) for color in config.colors]
        rakiks = [utils.normalize_text(rakik) for rakik in config.rakik]
        doubt_phrases = [
            utils.normalize_text(doubt_phrase)
            for doubt_phrase in config.doubt_phrase
        ]
        certain_phrases = [
            utils.normalize_text(certain_phrase)
            for certain_phrase in config.certain_phrase
        ]

        # psycholinguistic markers
        pos_words, neg_words = data_loader.load_positive_negative_words(
            positive_words_path=data_path.POSITIVE_WORDS_PATH,
            negative_words_path=data_path.NEGATIVE_WORDS_PATH)
        # positive words
        feature['PSY_F36'] = utils.count_chars(normalized_text, pos_words)
        # negative words
        feature['PSY_F37'] = utils.count_chars(normalized_text, neg_words)
        # colors
        feature['PSY_F38'] = utils.count_chars(normalized_text, colors)
        # profanities
        feature['PSY_F39'] = utils.count_chars(normalized_text, rakiks)
        # doubt and uncertainty phrases
        feature['PSY_F47'] = utils.count_chars(normalized_text, doubt_phrases)
        # certainty phrases
        feature['PSY_F48'] = utils.count_chars(normalized_text, certain_phrases)

        feature['number'] = key
        features.append(feature)
    return pd.DataFrame(features)
async def fetcher_profile(links):
    total_links = len(links)
    with alive_bar(total_links) as bar:
        data = await asyncio.gather(
            *[parse_profile_page_info(url.strip(), progress=bar) for url in links]
        )

    for link_num, info in enumerate(data):
        if not info:
            continue
        folder_name = links[link_num].split("/")[-1]
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        os.chdir(folder_name)
        with open("01 - details_page.xml", "a") as f:
            f.write("<item>\n")
            for k, v in info.items():
                if isinstance(v, dict):
                    for k1, v1 in v.items():
                        k1 = normalize_text(k1)
                        if not isinstance(v1, dict):
                            f.write(f"<{k1}>{v1}</{k1}>\n")
                        else:
                            for k2, v2 in v1.items():
                                k2 = normalize_text(k2)
                                if not isinstance(v2, list):
                                    f.write(f"<{k2}>{v2}</{k2}>\n")
                                else:
                                    for i in v2:
                                        f.write(f"<{k2}>{i}</{k2}>\n")
                else:
                    f.write(f"<link>{v}</link>\n")
            f.write("</item>\n\n")
        os.chdir("..")

    await client.aclose()
def get_or_create_region(region_dict):
    region_name = utils.normalize_text(region_dict['name'])
    ret_objs = search_obj_by_name(Region, region_name)
    if not ret_objs:
        # the region doesn't exist, the default country for
        # all regions will be paraguay
        country = Country.objects.get(name__iexact=DEF_COUNTRY)
        region_obj = create_new_region(region_dict)
    elif len(ret_objs) > 1:
        raise Exception('Got more than one region')
    else:
        region_obj = ret_objs[0]
    return region_obj
def get_or_create_stadium(stadium_dict, city_dict):
    stadium_name = utils.normalize_text(stadium_dict['name'])
    # delete word 'estadio'
    stadium_name = stadium_name.replace('estadio', '').strip()
    ret_obj = search_obj_by_name(Stadium, stadium_name)
    if not ret_obj:
        # the stadium doesn't exist yet
        city = get_or_create_city(city_dict)
        stadium_obj = create_new_stadium(stadium_dict, city)
    elif len(ret_obj) > 1:
        raise Exception('Got more than one stadium')
    else:
        stadium_obj = ret_obj[0]
    update_stadium(stadium_obj, stadium_dict)
    return stadium_obj
def get_or_create_city(city_dict):
    city_name = utils.normalize_text(city_dict['name'])
    ret_obj = search_obj_by_name(City, city_name)
    if not ret_obj:
        # the city doesn't exist, the default country for
        # all cities will be paraguay
        country = Country.objects.get(name__iexact=DEF_COUNTRY)
        if 'region' in city_dict:
            region = get_or_create_region(city_dict['region'])
        else:
            region = None
        city_obj = create_new_city(city_dict, region)
    elif len(ret_obj) > 1:
        raise Exception('Got more than one city')
    else:
        city_obj = ret_obj[0]
    update_city(city_obj, city_dict)
    return city_obj
def make_prediction(content):
    logger.info("title: %s", content['title'])
    logger.info("text: %s", content['text'])
    title = content['title']
    text = content['text'][:2000]
    text = ',%s.%s' % (title, text)
    # clean text
    text = utils.normalize_text(text)
    labels = classifier.predict_proba([text], k=5)[0]
    labels = map(lambda (k, v): {'rating': k, 'score': v}, labels)
    logger.info("prediction: %s", labels)
    result = {'status': 0, 'prediction': labels}
    return result
def test_provision_approval(
        setup_provider, provider, provisioning, vm_name, smtp_test, request, edit):
    """ Tests provisioning approval. Tests a couple of things.

    * Approve manually
    * Approve by editing the request to conform

    Prerequisites:
        * A provider that can provision.
        * Automate role enabled
        * User with e-mail set so you can receive and view them

    Steps:
        * Create a provisioning request that does not get automatically approved
            (e.g. ``num_vms`` bigger than 1)
        * Wait for an e-mail to come, informing you that the auto-approval was unsuccessful.
        * Depending on whether you want to do manual approval or edit approval, do:
            * MANUAL: manually approve the request in UI
            * EDIT: Edit the request in UI so it conforms to the rules for auto-approval.
        * Wait for an e-mail with approval
        * Wait until the request finishes
        * Wait until an e-mail informing about finished provisioning arrives.

    Metadata:
        test_flag: provision
        suite: infra_provisioning
    """
    # generate_tests makes sure these have values
    template, host, datastore = map(provisioning.get, ('template', 'host', 'datastore'))

    # It will provision two of them
    vm_names = [vm_name + "001", vm_name + "002"]
    request.addfinalizer(
        lambda: [cleanup_vm(vmname, provider) for vmname in vm_names])

    provisioning_data = {
        'vm_name': vm_name,
        'host_name': {'name': [host]},
        'datastore_name': {'name': [datastore]},
        'num_vms': "2",
    }

    # Same thing, different names. :\
    if provider.type == 'rhevm':
        provisioning_data['provision_type'] = 'Native Clone'
    elif provider.type == 'virtualcenter':
        provisioning_data['provision_type'] = 'VMware'

    try:
        provisioning_data['vlan'] = provisioning['vlan']
    except KeyError:
        # provisioning['vlan'] is required for rhevm provisioning
        if provider.type == 'rhevm':
            raise pytest.fail('rhevm requires a vlan value in provisioning info')

    do_vm_provisioning(template, provider, vm_name, provisioning_data, request, smtp_test,
                       wait=False)
    wait_for(
        lambda: len(filter(
            lambda mail: "your request for a new vms was not autoapproved" in normalize_text(
                mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)
    wait_for(
        lambda: len(filter(
            lambda mail: "virtual machine request was not approved" in normalize_text(
                mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)

    cells = {'Description': 'Provision from [{}] to [{}###]'.format(template, vm_name)}
    wait_for(lambda: requests.go_to_request(cells), num_sec=80, delay=5)
    if edit:
        # Automatic approval after editing the request to conform
        with requests.edit_request(cells) as form:
            fill(form.num_vms, "1")
            new_vm_name = vm_name + "_xx"
            fill(form.vm_name, new_vm_name)
        vm_names = [new_vm_name]  # Will be just one now
        cells = {'Description': 'Provision from [{}] to [{}]'.format(template, new_vm_name)}
        check = "vm provisioned successfully"
        request.addfinalizer(
            lambda: cleanup_vm(new_vm_name, provider))
    else:
        # Manual approval
        requests.approve_request(cells, "Approved")
        vm_names = [vm_name + "001", vm_name + "002"]  # There will be two VMs
        request.addfinalizer(
            lambda: [cleanup_vm(vmname, provider) for vmname in vm_names])
        check = "request complete"
    wait_for(
        lambda: len(filter(
            lambda mail: "your virtual machine configuration was approved" in normalize_text(
                mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=120, delay=5)

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info(
        'Waiting for vms "{}" to appear on provider {}'.format(
            ", ".join(vm_names), provider.key))
    wait_for(
        lambda: all(map(provider.mgmt.does_vm_exist, vm_names)),
        handle_exception=True, num_sec=600)

    row, __ = wait_for(requests.wait_for_request, [cells],
                       fail_func=requests.reload, num_sec=1500, delay=20)
    assert normalize_text(row.last_message.text) == check

    # Wait for e-mails to appear
    def verify():
        return (
            len(filter(
                lambda mail: "your virtual machine request has completed vm {}".format(
                    normalize_text(vm_name)) in normalize_text(mail["subject"]),
                smtp_test.get_emails())) == len(vm_names)
        )

    wait_for(verify, message="email receive check", delay=5)
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    filenames = ['{}.json'.format(os.path.join(data_dir, name)) for name in event_names]
    # data = [data_[:10] for data_ in load_data(filenames).values()]
    data = load_data(filenames, event_names)
    # import IPython; IPython.embed()

    documents = []
    doc_id = 0
    for event_name, data_ in data.iteritems():
        for text in data_:
            text = normalize_text(text)
            # if 'euro2016' not in text:
            #     continue
            doc = document_type(text.split(), [doc_id], event=event_name)
            documents.append(doc)
            doc_id += 1
    logging.info('number of documents: %i', len(documents))

    random.shuffle(documents)
    # documents = documents[:1000]  # for testing
    model = train_model(documents, model_filename)

    with open(data_filename, 'wb') as f:
        cPickle.dump(documents, f)

    # import IPython; IPython.embed()
def test_provision_approval(provider_init, provider_key, provider_crud, provider_type,
                            provider_mgmt, provisioning, vm_name, smtp_test, request):
    """ Tests provisioning approval

    Metadata:
        test_flag: provision
        suite: infra_provisioning
    """
    # generate_tests makes sure these have values
    template, host, datastore = map(provisioning.get, ('template', 'host', 'datastore'))

    # It will provision two of them
    vm_names = [vm_name + "001", vm_name + "002"]
    request.addfinalizer(
        lambda: [cleanup_vm(vmname, provider_key, provider_mgmt) for vmname in vm_names])

    provisioning_data = {
        'vm_name': vm_name,
        'host_name': {'name': [host]},
        'datastore_name': {'name': [datastore]},
        'num_vms': "2",
    }

    # Same thing, different names. :\
    if provider_type == 'rhevm':
        provisioning_data['provision_type'] = 'Native Clone'
    elif provider_type == 'virtualcenter':
        provisioning_data['provision_type'] = 'VMware'

    try:
        provisioning_data['vlan'] = provisioning['vlan']
    except KeyError:
        # provisioning['vlan'] is required for rhevm provisioning
        if provider_type == 'rhevm':
            raise pytest.fail('rhevm requires a vlan value in provisioning info')

    do_vm_provisioning(template, provider_crud, vm_name, provisioning_data, request,
                       provider_mgmt, provider_key, smtp_test, wait=False)
    wait_for(
        lambda: len(filter(
            lambda mail: "your request for a new vms was not autoapproved" in normalize_text(
                mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)
    wait_for(
        lambda: len(filter(
            lambda mail: "virtual machine request was not approved" in normalize_text(
                mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)
    cells = {'Description': 'Provision from [{}] to [{}###]'.format(template, vm_name)}
    wait_for(lambda: requests.go_to_request(cells), num_sec=80, delay=5)
    requests.approve_request(cells, "Approved")
    wait_for(
        lambda: len(filter(
            lambda mail: "your virtual machine configuration was approved" in normalize_text(
                mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info(
        'Waiting for vms "{}" to appear on provider {}'.format(
            ", ".join(vm_names), provider_crud.key))
    wait_for(
        lambda: all(map(provider_mgmt.does_vm_exist, vm_names)),
        handle_exception=True, num_sec=600)
    row, __ = wait_for(requests.wait_for_request, [cells],
                       fail_func=requests.reload, num_sec=1500, delay=20)
    assert normalize_text(row.last_message.text) == "request complete"

    # Wait for e-mails to appear
    def verify():
        return (
            len(filter(
                lambda mail: "your virtual machine request has completed vm {}".format(
                    normalize_text(vm_name)) in normalize_text(mail["subject"]),
                smtp_test.get_emails())) == 2
        )

    wait_for(verify, message="email receive check", delay=5)