def test_provision_from_template(provider, testing_instance, soft_assert):
    """ Tests instance provision from template

    Metadata:
        test_flag: provision
    """
    instance, inst_args, image = testing_instance
    instance.create(**inst_args)
    logger.info('Waiting for cfme provision request for vm %s', instance.name)
    row_description = 'Provision from [{}] to [{}]'.format(image, instance.name)
    cells = {'Description': row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload, num_sec=1500, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == 'ok' and \
        normalize_text(row.request_state.text) == 'finished', \
        "Provisioning failed with the message {}".format(row.last_message.text)
    instance.wait_to_appear(timeout=800)
    provider.refresh_provider_relationships()
    logger.info("Refreshing provider relationships and power states")
    refresh_timer = RefreshTimer(time_for_refresh=300)
    wait_for(provider.is_refreshed,
             [refresh_timer],
             message="is_refreshed",
             num_sec=1000,
             delay=60,
             handle_exception=True)
    soft_assert(instance.does_vm_exist_on_provider(), "Instance wasn't provisioned")
Example #2
def test_provision_from_template(request, setup_provider, provider,
                                 testing_instance, soft_assert):
    """ Tests instance provision from template

    Metadata:
        test_flag: provision
    """
    instance, inst_args, image = testing_instance
    instance.create(**inst_args)
    logger.info('Waiting for cfme provision request for vm %s', instance.name)
    row_description = 'Provision from [{}] to [{}]'.format(
        image, instance.name)
    cells = {'Description': row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload,
                           num_sec=1500,
                           delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == 'ok' and \
        normalize_text(row.request_state.text) == 'finished', \
        "Provisioning failed with the message {}".format(row.last_message.text)
    instance.wait_to_appear(timeout=800)
    provider.refresh_provider_relationships()
    logger.info("Refreshing provider relationships and power states")
    refresh_timer = RefreshTimer(time_for_refresh=300)
    wait_for(provider.is_refreshed, [refresh_timer],
             message="is_refreshed",
             num_sec=1000,
             delay=60,
             handle_exception=True)
    soft_assert(instance.does_vm_exist_on_provider(),
                "Instance wasn't provisioned")
Example #3
 def verify():
     return (
         len(filter(
             lambda mail:
             "your virtual machine request has completed vm {}".format(normalize_text(vm_name))
             in normalize_text(mail["subject"]),
             smtp_test.get_emails())) == len(vm_names)
     )
Example #4
def input_load(mode="train"):
    """
    Load the input text and the corresponding feature labels

    :param mode: whether to gather data for training and evaluation or for synthesis
    :return: the text labels, the text lengths, and the audio file paths
    """
    # creates vocab conversion dictionaries
    char2idx, _ = create_vocab()
    fpaths, text_lengths, texts = [], [], []

    # the path to the dataset
    base_path = os.path.join(DATA_PATH, 'wavs')
    # the path to the text
    transcript = os.path.join(DATA_PATH, 'metadata.csv')

    # training or evaluation
    if mode in ("train", "eval"):
        # Each epoch
        for _ in range(NUM_EPOCHS):
            # open the text file
            lines = codecs.open(transcript, 'r', ENCODING).readlines()
            for line in lines:
                fname, _, text = line.strip().split("|")

                # get the wav file paths
                fpath = os.path.join(base_path, fname + ".wav")
                fpaths.append(fpath)

                # clean and normalize the text
                text = normalize_text(text) + "$"  # "$" marks EOS
                text = [char2idx[char] for char in text]
                text_lengths.append(len(text))
                texts.append(np.array(text, np.int32).tostring())
        return fpaths, text_lengths, texts
    else:  # synthesis

        # Parse
        lines = codecs.open(TEST_DATA, 'r', 'utf-8').readlines()[1:]

        # Normalize text: $ is EOS
        sents = [
            normalize_text(line.split(" ", 1)[-1]).strip() + "$"
            for line in lines
        ]
        lengths = [len(sent) for sent in sents]
        maxlen = sorted(lengths, reverse=True)[0]

        # Pad the text
        texts = np.zeros((len(sents), maxlen), np.int32)
        for i, sent in enumerate(sents):
            texts[i, :len(sent)] = [char2idx[char] for char in sent]
        # return just the text, no lengths or paths needed
        return texts
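
A minimal usage sketch for input_load follows; the mode strings come from the function above, but the module constants (DATA_PATH, TEST_DATA, NUM_EPOCHS, ENCODING) are assumed to be configured elsewhere in the project:

# Illustrative only: both modes of input_load, assuming the constants above
# point at an existing LJSpeech-style dataset.
fpaths, text_lengths, texts = input_load(mode="train")
print("training samples:", len(fpaths))

padded = input_load(mode="synthesize")  # any mode other than "train"/"eval" reads TEST_DATA
print("synthesis batch shape:", padded.shape)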
def do_vm_provisioning(template_name, provider, vm_name, provisioning_data, request,
                       smtp_test, num_sec=1500, wait=True):
    # generate_tests makes sure these have values
    sel.force_navigate('infrastructure_provision_vms', context={
        'provider': provider,
        'template_name': template_name,
    })

    note = ('template {} to vm {} on provider {}'.format(template_name, vm_name, provider.key))
    provisioning_data.update({
        'email': '*****@*****.**',
        'first_name': 'Template',
        'last_name': 'Provisioner',
        'notes': note,
    })

    fill(provisioning_form, provisioning_data,
         action=provisioning_form.submit_button)
    flash.assert_no_errors()
    if not wait:
        return

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info('Waiting for vm %s to appear on provider %s', vm_name, provider.key)
    wait_for(provider.mgmt.does_vm_exist, [vm_name], handle_exception=True, num_sec=600)

    # nav to requests page happens on successful provision
    logger.info('Waiting for cfme provision request for vm %s', vm_name)
    row_description = 'Provision from [{}] to [{}]'.format(template_name, vm_name)
    cells = {'Description': row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells],
                           fail_func=requests.reload, num_sec=num_sec, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == 'ok' and \
        normalize_text(row.request_state.text) == 'finished'

    if smtp_test:
        # Wait for e-mails to appear
        def verify():
            if current_version() >= "5.4":
                approval = dict(subject_like="%%Your Virtual Machine configuration was Approved%%")
            else:
                approval = dict(text_like="%%Your Virtual Machine Request was approved%%")
            expected_text = "Your virtual machine request has Completed - VM:%%{}".format(vm_name)
            return (
                len(smtp_test.get_emails(**approval)) > 0 and
                len(smtp_test.get_emails(subject_like=expected_text)) > 0
            )

        wait_for(verify, message="email receive check", delay=5)
Example #6
    def load_dataset(self,
                     dataset,
                     columns=None,
                     drop_nan=True,
                     reset_data=True):
        self.vprint('loading dataset {0}'.format(
            dataset if isinstance(dataset, str) else 'from pandas DataFrame'))

        if isinstance(dataset, str):
            dataset = pd.read_csv(
                dataset, header=0
            )  # read csv assuming first line has header text. TODO handle files w/o headers
        else:
            assert isinstance(dataset, pd.DataFrame)

        headers = dataset.columns.values
        if columns:
            text_df = dataset[columns]
        else:
            text_df = dataset.select_dtypes([
                'object'
            ])  # drop non-text rows (pandas strings are of type 'object')
        # TODO confirm that the columns selected can't be cast to a numeric type to avoid numeric strings (e.g. '1')

        dtype_dropped = get_dropped(headers, text_df.columns.values)
        self.vprint('\ndropped non-text columns: {0}'.format(
            list(dtype_dropped)))

        if drop_nan:  # drop columns if there are any missing values
            # TODO handle missing values w/o dropping whole column
            text_df = text_df.dropna(axis=1, how='any')
            nan_dropped = get_dropped(headers, text_df.columns.values)
            nan_dropped = nan_dropped.difference(dtype_dropped)
            if nan_dropped:
                self.vprint(
                    '\ndropped columns with missing values: {0}'.format(
                        list(nan_dropped)))

        if not reset_data:
            # TODO implement variant where data is appended instead of overwritten
            raise Exception('not implemented')

        self.data = {}
        self.vprint('\nnormalizing headers')
        self.data['headers'] = self.format_data(headers)

        for col in text_df.columns.values:
            self.vprint('\nnormalizing column: {0}'.format(
                normalize_text(col, to_list=False)))
            self.data[normalize_text(col, to_list=False)] = self.format_data(
                text_df[col].values)

        return self.data
def __get_actual_affiliations(affiliations, author_affiliation):
    actual_affiliations = []
    author_affiliation = author_affiliation.strip()
    for affiliation in affiliations:
        affiliation = affiliation.strip()
        # normalize text before comparison
        affiliation_nor = normalize_text(affiliation)
        author_affiliation_nor = normalize_text(author_affiliation)
        if affiliation_nor.lower() in author_affiliation_nor.lower():
            if affiliation:
                actual_affiliations.append(affiliation.strip())
    return actual_affiliations
Example #8
def get_word_dependent_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        words, _ = utils.tokenize_word_and_sentence(normalized_text)

        C = len(normalized_text)
        N = len(words)

        # Word-based features
        # total number of words
        feature['WD_F7'] = N
        # average number of characters per word
        feature['WD_F8'] = C / N
        # lexical richness (distinct words divided by total word count)
        V = set(words)
        feature['WD_F9'] = len(V) / N
        # long words (3 or more characters) / N
        long_words = [w for w in words if len(w) >= 3]
        feature['WD_F10'] = len(long_words) / N
        # words of 2 or fewer characters / N
        short_words = [w for w in words if len(w) <= 2]
        feature['WD_F11'] = len(short_words) / N
        # words occurring exactly once (hapax legomena)
        counts = Counter(words)
        unique_words = [w for w in words if counts[w] == 1]
        feature['WD_F12'] = len(unique_words)
        # words occurring exactly twice
        double_words = [w for w in words if counts[w] == 2]
        feature['WD_F13'] = len(double_words)
        # Yule's K measure
        yules_k = 10000 * (-1 * (1.0 / N) + sum(
            list([(len(list(w for w in V if counts[w] == i))) * ((i / N)**2)
                  for i in range(1,
                                 len(V) + 1)])))
        feature['WD_F14'] = yules_k
        # Simpson's D measure
        simpsons_d = sum((len(list(w for w in V if counts[w] == i))) *
                         (i / N) * ((i - 1) / (N - 1))
                         for i in range(1, len(V)))
        feature['WD_F15'] = simpsons_d
        # Sichel's S measure
        sichels_s = len(double_words) / len(V)
        feature['WD_F16'] = sichels_s
        # Honore's R measure
        delimiter = 1 - len(unique_words) / len(V)
        if delimiter == 0:
            delimiter = 0.0001
        honores_R = (100 * math.log(N)) / (delimiter)
        feature['WD_F17'] = honores_R
        # entropy measure
        entropy = sum((len(list(w for w in V if counts[w] == i))) * (i / N) *
                      (-1 * math.log(i / N)) for i in range(1, len(V)))
        feature['WD_F50'] = entropy

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
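
To make the vocabulary-richness measures above easier to check, here is a small standalone sketch (not part of the original code) that computes the hapax count and Yule's K for a toy token list:

from collections import Counter

words = ["the", "cat", "sat", "on", "the", "mat", "the"]
N = len(words)                                  # 7 tokens
counts = Counter(words)
V = set(words)                                  # 5 distinct words
hapax = [w for w in words if counts[w] == 1]    # cat, sat, on, mat -> 4

# Yule's K: 10000 * (-1/N + sum_i V_i * (i/N)^2), where V_i is the number of
# words occurring exactly i times (here V_1 = 4, V_3 = 1).
yules_k = 10000 * (-1.0 / N + sum(
    len([w for w in V if counts[w] == i]) * (i / N) ** 2
    for i in range(1, len(V) + 1)))
print(len(hapax), round(yules_k, 1))            # 4 1224.5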
def load_embeddings(vocab,filepath,embedding_size):
    '''
    input:
        vocab: the vocab of squad
        filepath: glove file path
        embedding_size: embedding dim
    return:
        embeddings: the whole embedding
        fixed_embeddings: the fixed embedding
    '''
    embeddings = np.random.normal(0.00,1.00,[len(vocab),embedding_size])
    count = 0
    with open(filepath,"r",encoding="utf8") as f:               
        for line in f:
            word = line.rstrip().split(' ')[0]
            word = utils.normalize_text(word)
            if(word in vocab):
                count += 1
                vec = line.strip().split(" ")[1:]
                vec = np.array(vec)
                embeddings[vocab[word]] = vec
    embeddings[vocab["__NULL__"]] = np.zeros(embedding_size)
    fixed_embeddings = embeddings[FIXED_EMBEDDING_NUM+3:]
    print('the glove count:',count)
    return embeddings,fixed_embeddings
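
A hedged usage sketch for load_embeddings, assuming a vocab mapping that includes the "__NULL__" token used above; the GloVe filename and sizes below are placeholders, not values from the original project:

# Illustrative call only.
vocab = {"__NULL__": 0, "__UNK__": 1, "the": 2, "squad": 3}   # token -> index
embeddings, fixed_embeddings = load_embeddings(
    vocab, "glove.840B.300d.txt", embedding_size=300)
print(embeddings.shape)        # (len(vocab), 300)
print(fixed_embeddings.shape)  # rows after index FIXED_EMBEDDING_NUM + 3, as sliced above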
Example #10
def get_structural_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        words, sentences = utils.tokenize_word_and_sentence(normalized_text)
        N = len(words)
        S = len(sentences)

        # Structural features
        # total number of lines
        feature['STR_F25'] = text.count('\n')
        # total number of sentences = S
        feature['STR_F26'] = S
        # average number of words per sentence
        feature['STR_F27'] = N / S

        empty_lines = text.replace(" ", "").count('\n\n')
        # empty lines / total lines
        total_lines = text.replace(" ", "").count('\n') + 1
        feature['STR_F28'] = empty_lines / total_lines
        # average length of the non-empty lines
        feature['STR_F29'] = len(text) / (total_lines - empty_lines)

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
Example #11
def get_text_dependent_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        # words, _ = utils.tokenize_word_and_sentence(normalized_text)

        alphabets_in_texts = [i for i in Counter(normalized_text) if i in config.alphabet]
        C = len(normalized_text)
        # N = len(words)

        feature['TD_F1'] = C
        # alphabet characters found in the text / C
        feature['TD_F2'] = len(alphabets_in_texts) / C
        # number of alphabet characters found in the text
        feature['TD_F49'] = len(alphabets_in_texts)
        # total number of digits
        feature['TD_F3'] = utils.count_chars(normalized_text, config.numbers)
        # space characters / C
        feature['TD_F4'] = text.count(' ') / C
        # tab characters / C
        feature['TD_F5'] = text.count('\t') / C
        # special characters / C
        feature['TD_F6'] = utils.count_chars(normalized_text, config.special_chars) / C

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
Example #12
def get_syntactics_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        words, _ = utils.tokenize_word_and_sentence(normalized_text)

        C = len(normalized_text)
        # Syntactic features
        # commas / C
        feature['SYN_F18'] = text.count('،') / C
        # periods / C
        feature['SYN_F19'] = text.count('.') / C
        # colons / C
        feature['SYN_F20'] = text.count(':') / C
        # semicolons / C
        feature['SYN_F21'] = text.count(';') / C
        # question marks / C
        feature['SYN_F22'] = text.count('؟') / C
        # exclamation marks / C
        feature['SYN_F23'] = text.count('!') / C
        # triple exclamation marks / C
        feature['SYN_F24'] = text.count('!!!') / C

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
Example #13
    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx]
        x, y = [], []
        is_heads = []
        for w, t in zip(words, tags):
            w = normalize_text(w)
            tokens = self.tokenizer.tokenize(w) if w not in ("[CLS]",
                                                             "[SEP]") else [w]
            xx = self.tokenizer.convert_tokens_to_ids(tokens)
            # assert len(tokens) == len(xx), f"len(tokens)={len(tokens)}, len(xx)={len(xx)}"

            # Chinese tokens are not split into multiple wordpieces the way English tokens are
            is_head = [1] + [0] * (len(xx) - 1)

            # t = [t] + ['<PAD>'] * (len(tokens) - 1)

            yy = [self.tag2idx[t]]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x) == len(y) == len(
            is_heads
        ), f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)}"

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen
    def dl_model_classify(self, text, normalize=True):
        """
        classify the given text to the appropriate classes "multi class for each text using the deep learning model"
        Args:
            text (str): represents text that we want to classify.
            normalize(bool): represents a flag if it true we will normalise the text before pass it to the model.
            Default True.
        Returns:
            list of 11 items "class" contains 0's and 1's, 1 means you could map the text to this class and 0 means that
            you couldn't map the text to this class.
            [
            'فن ومشاهير',
            'أخبار',
            'رياضة',
            'اقتصاد',
            'تكنولوجيا',
            'اسلام و أديان',
            'سيارات',
            'طقس',
            'منوعات أخرى',
            'صحة',
            'مطبخ'
        ]
        """
        if normalize:
            text = [normalize_text(text)]

        embedded_text = self.dl_vectorizer.embed_batch(text,
                                                       MAX_SENTENCE_LENGTH)
        predictions = (self.dl_model.predict(embedded_text) > 0.5).astype(int)
        return predictions.tolist()[0]
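
A hypothetical call site for dl_model_classify, assuming an already constructed classifier object that carries the dl_vectorizer and dl_model attributes used above:

# Hypothetical usage; constructing the classifier is project-specific.
labels = classifier.dl_model_classify("خبر اقتصادي عن أسعار النفط", normalize=True)
# labels is a list of 11 zeros/ones aligned with the class list in the docstring;
# a 1 at the position of 'اقتصاد' means the text was assigned to the economy class.
print(labels)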
Example #15
async def parse_facts_component(html_content):
    meta_data = {}
    elem = html_content.find("ol", class_="chart")

    meta_data[html_content.find("strong").text.strip()] = [
        i.text.strip() for i in elem.findAll("li")
    ]

    elem_2 = html_content.findAll("div", class_="form-group")[1]

    meta_data[elem_2.find("strong").text.strip()] = [
        i.text.strip() for i in elem_2.findAll("li")
    ]

    text = ""
    for k, v in meta_data.items():
        k = normalize_text(k)
        text += f"<{k}>"
        for i in v:
            if not i:
                continue
            text += "<value>"
            text += f"{i}"
            text += "</value>"
        text += f"</{k}>"

    return prettify(text, "facts")
Example #16
def do_vm_provisioning(
    template_name, provider, vm_name, provisioning_data, request, smtp_test, num_sec=1500, wait=True
):
    # generate_tests makes sure these have values
    vm = Vm(name=vm_name, provider=provider, template_name=template_name)
    navigate_to(vm, "ProvisionVM")

    note = "template {} to vm {} on provider {}".format(template_name, vm_name, provider.key)
    provisioning_data.update(
        {
            "email": "*****@*****.**",
            "first_name": "Template",
            "last_name": "Provisioner",
            "notes": note,
        }
    )

    fill(provisioning_form, provisioning_data, action=provisioning_form.submit_button)
    flash.assert_no_errors()
    if not wait:
        return

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info("Waiting for vm %s to appear on provider %s", vm_name, provider.key)
    wait_for(provider.mgmt.does_vm_exist, [vm_name], handle_exception=True, num_sec=600)

    # nav to requests page happens on successful provision
    logger.info("Waiting for cfme provision request for vm %s", vm_name)
    row_description = "Provision from [{}] to [{}]".format(template_name, vm_name)
    cells = {"Description": row_description}
    try:
        row, __ = wait_for(requests.wait_for_request, [cells], fail_func=requests.reload, num_sec=num_sec, delay=20)
    except Exception as e:
        requests.debug_requests()
        raise e
    assert normalize_text(row.status.text) == "ok" and normalize_text(row.request_state.text) == "finished"

    if smtp_test:
        # Wait for e-mails to appear
        def verify():
            approval = dict(subject_like="%%Your Virtual Machine configuration was Approved%%")
            expected_text = "Your virtual machine request has Completed - VM:%%{}".format(vm_name)
            return (
                len(smtp_test.get_emails(**approval)) > 0 and len(smtp_test.get_emails(subject_like=expected_text)) > 0
            )

        wait_for(verify, message="email receive check", delay=5)
def __affiliations_to_save(affiliations, new_affiliations):
    jarowinkler = JaroWinkler()
    similarity_threshold = 0.95
    affiliations_to_save = []
    for new_affiliation in new_affiliations:
        exist_affiliation = False
        for affiliation in affiliations:
            # normalize text before comparison
            affiliation_nor = normalize_text(affiliation)
            new_affiliation_nor = normalize_text(new_affiliation)
            similarity_score = jarowinkler.similarity(
                affiliation_nor.lower(), new_affiliation_nor.lower())
            if similarity_score >= similarity_threshold:
                exist_affiliation = True
        if not exist_affiliation:
            affiliations_to_save.append(new_affiliation)
    return affiliations_to_save
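
The loop above treats two affiliation strings as the same institution when their Jaro-Winkler similarity reaches 0.95; a small illustration, assuming the JaroWinkler class imported by this module exposes the similarity(a, b) method used above:

# Illustration of the 0.95 similarity threshold (printed values are approximate).
jw = JaroWinkler()
a = "universidad nacional de asuncion"
b = "universidad nacional de asunción"
print(jw.similarity(a.lower(), b.lower()))                 # close to 1.0 -> not saved again
print(jw.similarity(a.lower(), "universidad catolica"))    # well below 0.95 -> saved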
def load_glovedict(filepath):
    embedding_words = set()
    with open(filepath,"r",encoding="utf8") as f:
        for line in f:
            w = utils.normalize_text(line.rstrip().split(' ')[0])
            embedding_words.add(w)
    print('Num words in glove dict is',len(embedding_words))
    return embedding_words
Example #19
def disambiguate_objs_by_name(objs, ref_name):
    similarity = []
    for obj in objs:
        nor_name = utils.normalize_text(obj.name)
        names = itertools.izip_longest(nor_name, ref_name)
        similarity.append(len([c1 for c1, c2 in names if c1 == c2]))
    idx_max_sim = similarity.index(max(similarity))
    return objs[idx_max_sim]
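
The similarity used here is simply the number of positions at which the normalized object name and the reference name share the same character; a standalone sketch of that comparison (using zip_longest; the original Python 2 code uses itertools.izip_longest):

from itertools import zip_longest  # izip_longest in the original Python 2 code

def position_overlap(a, b):
    # count positions where both strings carry the same character
    return len([c1 for c1, c2 in zip_longest(a, b) if c1 == c2])

print(position_overlap("olimpia", "olimpia asuncion"))  # 7
print(position_overlap("olimpia", "cerro porteno"))     # 0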
Example #20
def get_sentences(dataset):
    sentences = []
    for text in dataset.values():
        normalized_text = utils.normalize_text(text)
        words, _ = utils.tokenize_word_and_sentence(normalized_text,
                                                    include_special_chars=True)
        sentences.append(words)
    return sentences
def extract_keywords(text, language):
    normalized_text = utils.normalize_text(text)
    sentences_list = utils.split_sentence(normalized_text)
    stopwords_regex = __get_stopwords_regex(language)
    keywords_candidate = __extract_keywords_candidate(sentences_list, stopwords_regex)
    word_scores = __calculate_word_scores(keywords_candidate)
    keywords_scores = __calculate_keyword_scores(keywords_candidate, word_scores)
    keywords_selected = __selected_keywords(keywords_scores)
    return keywords_selected
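
A hedged usage sketch for extract_keywords; the language key and the exact shape of the returned ranking depend on the surrounding utils/stopword helpers, which are assumed to be importable:

# Illustrative call; 'english' is assumed to be a supported stopword language key.
keywords = extract_keywords(
    "Compatibility of systems of linear constraints over the set of natural numbers",
    language="english")
print(keywords)  # a ranked list of candidate keyword phrases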
Example #22
    def filedump(self,
                 description,
                 contents,
                 slaveid=None,
                 mode="w",
                 contents_base64=False,
                 display_type="primary",
                 display_glyph=None,
                 file_type=None,
                 dont_write=False,
                 os_filename=None,
                 group_id=None,
                 test_name=None,
                 test_location=None):
        if slaveid is not None:
            if not slaveid:
                slaveid = "Master"
            test_ident = "{}/{}".format(self.store[slaveid]['test_location'],
                                        self.store[slaveid]['test_name'])
        else:
            test_ident = "{}/{}".format(test_location, test_name)
        artifacts = []
        if os_filename is None:
            safe_name = re.sub(r"\s+", "_",
                               normalize_text(safe_string(description)))
            os_filename = self.ident + "-" + safe_name
            os_filename = os.path.join(self.store[slaveid]['artifact_path'],
                                       os_filename)
            if file_type is not None and "screenshot" in file_type:
                os_filename = os_filename + ".png"
            elif file_type is not None and ("_tb" in file_type
                                            or "traceback" in file_type
                                            or file_type == "log"):
                os_filename = os_filename + ".log"
            elif file_type is not None and file_type == "html":
                os_filename = os_filename + ".html"
            elif file_type is not None and file_type == "video":
                os_filename = os_filename + ".ogv"
            else:
                os_filename = os_filename + ".txt"
        artifacts.append({
            "file_type": file_type,
            "display_type": display_type,
            "display_glyph": display_glyph,
            "description": description,
            "os_filename": os_filename,
            "group_id": group_id,
        })
        if not dont_write:
            if os.path.isfile(os_filename):
                os.remove(os_filename)
            with open(os_filename, mode) as f:
                if contents_base64:
                    contents = base64.b64decode(contents)
                f.write(contents)

        return None, {'artifacts': {test_ident: {'files': artifacts}}}
def get_grammatical_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)
        tagger = hazm.POSTagger(model=data_path.POSTAGGER_MODEL_PATH)
        tags = tagger.tag(hazm.word_tokenize(normalized_text))
        tags_list = [i[1] for i in tags]

        sounds = [utils.normalize_text(sound) for sound in config.sounds]
        group_pros = [
            utils.normalize_text(group_pro) for group_pro in config.group_pro
        ]
        conjunctions = [
            utils.normalize_text(conjunction)
            for conjunction in config.conjunctions
        ]
        subjective_pronounces = [
            utils.normalize_text(subjective_pronounce)
            for subjective_pronounce in config.subjective_pronounce
        ]

        feature['GRM_F30'] = utils.count_chars(text, subjective_pronounces)

        feature['GRM_F31'] = utils.count_chars(text, config.question)

        feature['GRM_F32'] = utils.count_chars(text, conjunctions)

        feature['GRM_F33'] = utils.count_chars(text, group_pros)

        feature['GRM_F34'] = utils.count_chars(text, sounds)

        feature['GRM_F35'] = tags_list.count('P') + tags_list.count('POSTP')

        feature['GRM_F40'] = tags_list.count('AJ')
        feature['GRM_F41'] = tags_list.count('ADV')
        feature['GRM_F42'] = tags_list.count('PRO')
        feature['GRM_F51'] = tags_list.count('NUM')

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
Example #24
def search_obj_by_name(model, name):
    nor_name = utils.normalize_text(name).strip()
    objs_to_return = model.objects.filter(name__icontains=nor_name)
    if len(objs_to_return) == 1:
        return objs_to_return
    else:
        if len(objs_to_return) == 0:
            objs_to_return = search_most_similar_strings(model, nor_name)
        if len(objs_to_return) > 1:
            objs_to_return = [disambiguate_objs_by_name(objs_to_return, nor_name)]
    return objs_to_return
Example #25
async def parse_details_component(html_content):
    meta_data = {}
    for i in html_content.findAll("div", class_="form-group"):
        key = i.find("strong").text.strip()
        meta_data[key] = {}
        keys = i.findAll("div", class_="col-xs-7")
        values = i.findAll("div", class_="col-xs-5")
        for k, v in zip(keys, values):
            k, v = k.text.strip(), v.text.strip()
            meta_data[key][k] = v
    text = ""
    for k, v in meta_data.items():
        k = normalize_text(k)
        text += f"<{k}>"
        for k1, v1 in v.items():
            k1 = normalize_text(k1)
            text += f"<{k1}>{v1}</{k1}>"
        text += f"</{k}>"

    return prettify(text, "details")
def tokenization(data, tokenizer):

    context = []

    for i in (data):

        text = normalize_text(i)
        encode = tokenizer.encode(text)
        context.append(encode)

    return context
Example #27
def load_wv_vocab(vocab_file):
    '''
    Load the vocabulary from a word-vector file
    '''
    vocab = set()
    with open(vocab_file, encoding='utf8') as f:
        for line in f:
            elems = line.split()
            token = normalize_text(''.join(elems[0:-args.embedding_dim]))
            vocab.add(token)
    return vocab
Example #28
def get_or_create_player(tournament_obj, player_dict):
    player_name = utils.normalize_text(player_dict['name'])
    ret_obj = search_person_by_name(player_name)
    if not ret_obj:
        # the player doesn't exist yet
        player_obj = create_new_person(player_dict)
    elif len(ret_obj) > 1:
        player_obj = disambiguate_player(ret_obj, tournament_obj)
    else:
        player_obj = ret_obj[0]
    update_person(player_obj, player_dict)
    return player_obj
def get_psychological_features(dataset):
    features = []
    for key, text in tqdm(dataset.items()):
        feature = {}
        normalized_text = utils.normalize_text(text)

        colors = [utils.normalize_text(color) for color in config.colors]
        rakiks = [utils.normalize_text(rakik) for rakik in config.rakik]
        doubt_phrases = [
            utils.normalize_text(doubt_phrase)
            for doubt_phrase in config.doubt_phrase
        ]
        certain_phrases = [
            utils.normalize_text(certain_phrase)
            for certain_phrase in config.certain_phrase
        ]

        # Psycho-linguistic markers
        pos_words, neg_words = data_loader.load_positive_negative_words(
            positive_words_path=data_path.POSITIVE_WORDS_PATH,
            negative_words_path=data_path.NEGATIVE_WORDS_PATH)
        # positive words
        feature['PSY_F36'] = utils.count_chars(normalized_text, pos_words)
        # negative words
        feature['PSY_F37'] = utils.count_chars(normalized_text, neg_words)
        # color words
        feature['PSY_F38'] = utils.count_chars(normalized_text, colors)
        # obscene words
        feature['PSY_F39'] = utils.count_chars(normalized_text, rakiks)
        # doubt / uncertainty phrases
        feature['PSY_F47'] = utils.count_chars(normalized_text, doubt_phrases)
        # certainty phrases
        feature['PSY_F48'] = utils.count_chars(normalized_text,
                                               certain_phrases)

        feature['number'] = key

        features.append(feature)
    return pd.DataFrame(features)
Example #30
async def fetcher_profile(links):
    total_links = len(links)
    with alive_bar(total_links) as bar:
        data = await asyncio.gather(
            *[parse_profile_page_info(url.strip(), progress=bar) for url in links]
        )

        for link_num, info in enumerate(data):
            if not info:
                continue
            folder_name = links[link_num].split("/")[-1]

            if not os.path.exists(folder_name):
                os.makedirs(folder_name)

            os.chdir(folder_name)

            with open("01 - details_page.xml", "a") as f:
                f.write("<item>\n")
                for k, v in info.items():
                    if isinstance(v, dict):
                        for k1, v1 in v.items():
                            k1 = normalize_text(k1)
                            if not isinstance(v1, dict):
                                f.write(f"<{k1}>{v1}</{k1}>\n")
                            else:
                                for k2, v2 in v1.items():
                                    k2 = normalize_text(k2)
                                    if not isinstance(v2, list):
                                        f.write(f"<{k2}>{v2}</{k2}>\n")
                                    else:
                                        for i in v2:
                                            f.write(f"<{k2}>{i}</{k2}>\n")
                    else:
                        f.write(f"<link>{v}</link>\n")
                f.write("</item>\n\n")
            os.chdir("..")

    await client.aclose()
Example #31
def get_or_create_region(region_dict):
    region_name = utils.normalize_text(region_dict['name'])
    ret_objs = search_obj_by_name(Region, region_name)
    if not ret_objs:
        # the region doesn't exist, the default country for
        # all regions will be paraguay
        country = Country.objects.get(name__iexact=DEF_COUNTRY)
        region_obj = create_new_region(region_dict)
    elif len(ret_objs) > 1:
        raise Exception('Got more than one region')
    else:
        region_obj = ret_objs[0]
    return region_obj
Example #32
def get_or_create_stadium(stadium_dict, city_dict):
    stadium_name = utils.normalize_text(stadium_dict['name'])
    # delete word 'estadio'
    stadium_name = stadium_name.replace('estadio', '').strip()
    ret_obj = search_obj_by_name(Stadium, stadium_name)
    if not ret_obj:
        # the stadium doesn't exist yet
        city = get_or_create_city(city_dict)
        stadium_obj = create_new_stadium(stadium_dict, city)
    elif len(ret_obj) > 1:
        raise Exception('Got more than one stadium')
    else:
        stadium_obj = ret_obj[0]
        update_stadium(stadium_obj, stadium_dict)
    return stadium_obj
Example #33
def get_or_create_city(city_dict):
    city_name = utils.normalize_text(city_dict['name'])
    ret_obj = search_obj_by_name(City, city_name)
    if not ret_obj:
        # the city doesn't exist, the default country for
        # all cities will be paraguay
        country = Country.objects.get(name__iexact=DEF_COUNTRY)
        if 'region' in city_dict:
            region = get_or_create_region(city_dict['region'])
        else:
            region = None
        city_obj = create_new_city(city_dict, region)
    elif len(ret_obj) > 1:
        raise Exception('Got more than one city')
    else:
        city_obj = ret_obj[0]
        update_city(city_obj, city_dict)
    return city_obj
Example #34
    def filedump(self, description, contents, slaveid=None, mode="w", contents_base64=False,
                 display_type="primary", display_glyph=None, file_type=None,
                 dont_write=False, os_filename=None, group_id=None, test_name=None,
                 test_location=None):
        if slaveid is not None:
            if not slaveid:
                slaveid = "Master"
            test_ident = "{}/{}".format(self.store[slaveid]['test_location'],
                self.store[slaveid]['test_name'])
        else:
            test_ident = "{}/{}".format(test_location, test_name)
        artifacts = []
        if os_filename is None:
            safe_name = re.sub(r"\s+", "_", normalize_text(safe_string(description)))
            os_filename = self.ident + "-" + safe_name
            os_filename = os.path.join(self.store[slaveid]['artifact_path'], os_filename)
            if file_type is not None and "screenshot" in file_type:
                os_filename = os_filename + ".png"
            elif file_type is not None and (
                    "_tb" in file_type or "traceback" in file_type or file_type == "log"):
                os_filename = os_filename + ".log"
            elif file_type is not None and file_type == "html":
                os_filename = os_filename + ".html"
            elif file_type is not None and file_type == "video":
                os_filename = os_filename + ".ogv"
            else:
                os_filename = os_filename + ".txt"
        artifacts.append({
            "file_type": file_type,
            "display_type": display_type,
            "display_glyph": display_glyph,
            "description": description,
            "os_filename": os_filename,
            "group_id": group_id,
        })
        if not dont_write:
            if os.path.isfile(os_filename):
                os.remove(os_filename)
            with open(os_filename, mode) as f:
                if contents_base64:
                    contents = base64.b64decode(contents)
                f.write(contents)

        return None, {'artifacts': {test_ident: {'files': artifacts}}}
Example #35
def make_prediction(content):
    logger.info("title: %s", content['title'])
    logger.info("text: %s", content['text'])

    title = content['title']
    text = content['text'][:2000]

    text = ',%s.%s' % (title, text)

    # clean text
    text = utils.normalize_text(text)

    labels = classifier.predict_proba([text], k=5)[0]

    labels = map(lambda (k, v): {'rating': k, 'score': v}, labels)

    logger.info("prediction: %s", labels)

    result = {'status': 0, 'prediction': labels}

    return result
def test_provision_approval(
        setup_provider, provider, provisioning, vm_name, smtp_test, request, edit):
    """ Tests provisioning approval. Tests couple of things.

    * Approve manually
    * Approve by editing the request to conform

    Prerequisities:
        * A provider that can provision.
        * Automate role enabled
        * User with e-mail set so you can receive and view them

    Steps:
        * Create a provisioning request that does not get automatically approved (eg. ``num_vms``
            bigger than 1)
        * Wait for an e-mail to come, informing you that the auto-approval was unsuccessful.
        * Depending on whether you want to do manual approval or edit approval, do:
            * MANUAL: manually approve the request in UI
            * EDIT: Edit the request in UI so it conforms the rules for auto-approval.
        * Wait for an e-mail with approval
        * Wait until the request finishes
        * Wait until an email, informing about finished provisioning, comes.

    Metadata:
        test_flag: provision
        suite: infra_provisioning
    """
    # generate_tests makes sure these have values
    template, host, datastore = map(provisioning.get, ('template', 'host', 'datastore'))

    # It will provision two of them
    vm_names = [vm_name + "001", vm_name + "002"]
    request.addfinalizer(
        lambda: [cleanup_vm(vmname, provider) for vmname in vm_names])

    provisioning_data = {
        'vm_name': vm_name,
        'host_name': {'name': [host]},
        'datastore_name': {'name': [datastore]},
        'num_vms': "2",
    }

    # Same thing, different names. :\
    if provider.type == 'rhevm':
        provisioning_data['provision_type'] = 'Native Clone'
    elif provider.type == 'virtualcenter':
        provisioning_data['provision_type'] = 'VMware'

    try:
        provisioning_data['vlan'] = provisioning['vlan']
    except KeyError:
        # provisioning['vlan'] is required for rhevm provisioning
        if provider.type == 'rhevm':
            raise pytest.fail('rhevm requires a vlan value in provisioning info')

    do_vm_provisioning(template, provider, vm_name, provisioning_data, request, smtp_test,
                       wait=False)
    wait_for(
        lambda:
        len(filter(
            lambda mail:
            "your request for a new vms was not autoapproved" in normalize_text(mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)
    wait_for(
        lambda:
        len(filter(
            lambda mail:
            "virtual machine request was not approved" in normalize_text(mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)

    cells = {'Description': 'Provision from [{}] to [{}###]'.format(template, vm_name)}
    wait_for(lambda: requests.go_to_request(cells), num_sec=80, delay=5)
    if edit:
        # Automatic approval after editing the request to conform
        with requests.edit_request(cells) as form:
            fill(form.num_vms, "1")
            new_vm_name = vm_name + "_xx"
            fill(form.vm_name, new_vm_name)
        vm_names = [new_vm_name]  # Will be just one now
        cells = {'Description': 'Provision from [{}] to [{}]'.format(template, new_vm_name)}
        check = "vm provisioned successfully"
        request.addfinalizer(
            lambda: cleanup_vm(new_vm_name, provider))
    else:
        # Manual approval
        requests.approve_request(cells, "Approved")
        vm_names = [vm_name + "001", vm_name + "002"]  # There will be two VMs
        request.addfinalizer(
            lambda: [cleanup_vm(vmname, provider) for vmname in vm_names])
        check = "request complete"
    wait_for(
        lambda:
        len(filter(
            lambda mail:
            "your virtual machine configuration was approved" in normalize_text(mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=120, delay=5)

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info(
        'Waiting for vms "{}" to appear on provider {}'.format(
            ", ".join(vm_names), provider.key))
    wait_for(
        lambda: all(map(provider.mgmt.does_vm_exist, vm_names)),
        handle_exception=True, num_sec=600)

    row, __ = wait_for(requests.wait_for_request, [cells],
                       fail_func=requests.reload, num_sec=1500, delay=20)
    assert normalize_text(row.last_message.text) == check

    # Wait for e-mails to appear
    def verify():
        return (
            len(filter(
                lambda mail:
                "your virtual machine request has completed vm {}".format(normalize_text(vm_name))
                in normalize_text(mail["subject"]),
                smtp_test.get_emails())) == len(vm_names)
        )

    wait_for(verify, message="email receive check", delay=5)
Example #37
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    filenames = ['{}.json'.format(os.path.join(data_dir, name))
                 for name in event_names]
    # data = [data_[:10] for data_ in load_data(filenames).values()]
    data = load_data(filenames, event_names)

    # import IPython; IPython.embed()

    documents = []

    doc_id = 0
    for event_name, data_ in data.iteritems():
        for text in data_:
            text = normalize_text(text)
            # if 'euro2016' not in text:
            #     continue
            doc = document_type(text.split(), [doc_id], event=event_name)
            documents.append(doc)
            doc_id += 1

    logging.info('number of documents: %i', len(documents))
    random.shuffle(documents)
    # documents = documents[:1000]  # for testing

    model = train_model(documents, model_filename)
    with open(data_filename, 'wb') as f:
        cPickle.dump(documents, f)

    # import IPython; IPython.embed()
def test_provision_approval(provider_init, provider_key, provider_crud, provider_type,
                            provider_mgmt, provisioning, vm_name, smtp_test, request):
    """ Tests provisioning approval

    Metadata:
        test_flag: provision
        suite: infra_provisioning
    """
    # generate_tests makes sure these have values
    template, host, datastore = map(provisioning.get, ('template', 'host', 'datastore'))

    # It will provision two of them
    vm_names = [vm_name + "001", vm_name + "002"]
    request.addfinalizer(
        lambda: [cleanup_vm(vmname, provider_key, provider_mgmt) for vmname in vm_names])

    provisioning_data = {
        'vm_name': vm_name,
        'host_name': {'name': [host]},
        'datastore_name': {'name': [datastore]},
        'num_vms': "2",
    }

    # Same thing, different names. :\
    if provider_type == 'rhevm':
        provisioning_data['provision_type'] = 'Native Clone'
    elif provider_type == 'virtualcenter':
        provisioning_data['provision_type'] = 'VMware'

    try:
        provisioning_data['vlan'] = provisioning['vlan']
    except KeyError:
        # provisioning['vlan'] is required for rhevm provisioning
        if provider_type == 'rhevm':
            raise pytest.fail('rhevm requires a vlan value in provisioning info')

    do_vm_provisioning(template, provider_crud, vm_name, provisioning_data, request,
                       provider_mgmt, provider_key, smtp_test, wait=False)
    wait_for(
        lambda:
        len(filter(
            lambda mail:
            "your request for a new vms was not autoapproved" in normalize_text(mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)
    wait_for(
        lambda:
        len(filter(
            lambda mail:
            "virtual machine request was not approved" in normalize_text(mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)

    cells = {'Description': 'Provision from [{}] to [{}###]'.format(template, vm_name)}
    wait_for(lambda: requests.go_to_request(cells), num_sec=80, delay=5)
    requests.approve_request(cells, "Approved")
    wait_for(
        lambda:
        len(filter(
            lambda mail:
            "your virtual machine configuration was approved" in normalize_text(mail["subject"]),
            smtp_test.get_emails())) > 0,
        num_sec=90, delay=5)

    # Wait for the VM to appear on the provider backend before proceeding to ensure proper cleanup
    logger.info(
        'Waiting for vms "{}" to appear on provider {}'.format(
            ", ".join(vm_names), provider_crud.key))
    wait_for(
        lambda: all(map(provider_mgmt.does_vm_exist, vm_names)),
        handle_exception=True, num_sec=600)

    row, __ = wait_for(requests.wait_for_request, [cells],
                       fail_func=requests.reload, num_sec=1500, delay=20)
    assert normalize_text(row.last_message.text) == "request complete"

    # Wait for e-mails to appear
    def verify():
        return (
            len(filter(
                lambda mail:
                "your virtual machine request has completed vm {}".format(normalize_text(vm_name))
                in normalize_text(mail["subject"]),
                smtp_test.get_emails())) == 2
        )

    wait_for(verify, message="email receive check", delay=5)