コード例 #1
0
def gen_bigartm_operator(actualizable_bigartms, comboable_bigartms, name, description, number_of_topics, filters, regularization_params, wait_for_basic_tms,
                         is_actualizable=False, name_translit=None, topic_modelling_translit=None, is_comboable=True,
                         text_field="text_lemmatized"):
    """Create a DjangoOperator that runs a BigARTM topic-modelling calc.

    The operator is wired against ``wait_for_basic_tms`` (downstream of it
    when the filters reference a document group, upstream otherwise) and the
    TM is registered in ``actualizable_bigartms`` / ``comboable_bigartms`` so
    other DAGs (actualization, combo finding) can pick it up.

    ``filters`` must contain at least 'corpus', 'source', 'datetime_from'
    and 'datetime_to'; 'group_id', 'topic_weight_threshold' and
    'corpus_datetime_ignore' are optional.
    """
    from dags.bigartm.services.service import bigartm_calc

    # Transliterated names keep task_ids ASCII-safe for Airflow.
    if not name_translit:
        task_id = f"bigartm_calc_{name}"
    else:
        task_id = f"bigartm_calc_{topic_modelling_translit}_{name_translit}"
    # Shallow-copy filters before storing them in the actualization registry
    # so later caller-side mutation does not leak into it.
    filters_copy = filters.copy()
    bigartm_calc_operator = DjangoOperator(
        task_id=task_id,
        python_callable=bigartm_calc,
        op_kwargs={
            "name": name,
            "name_translit": name_translit,
            "text_field": text_field,
            "corpus": filters['corpus'],
            "corpus_datetime_ignore": filters.get('corpus_datetime_ignore', []),
            "source": filters['source'],
            "datetime_from": filters['datetime_from'],
            "datetime_to": filters['datetime_to'],
            "group_id": filters.get('group_id'),  # .get() instead of membership-test conditional
            "topic_weight_threshold": filters.get('topic_weight_threshold', 0.05),
            "is_ready": False,
            "description": description,
            "datetime_created": datetime.now(),  # evaluated at DAG-parse time
            "algorithm": "BigARTM",
            "meta_parameters": {},
            "number_of_topics": number_of_topics,
            "regularization_params": regularization_params,
            "is_actualizable": is_actualizable,
        }
    )
    # TMs built on top of a document group must wait for the basic TMs;
    # every other TM is itself a prerequisite of wait_for_basic_tms.
    if filters.get('group_id'):
        wait_for_basic_tms >> bigartm_calc_operator
    else:
        bigartm_calc_operator >> wait_for_basic_tms
    if is_actualizable:
        actualizable_bigartms.append(
            {
                "name": name,
                "name_translit": name_translit,
                "text_field": text_field,
                "regularization_params": regularization_params.copy(),
                "filters": filters_copy,
            }
        )
    if is_comboable:
        comboable_bigartms.append(
            {
                "name": name,
                "name_translit": name_translit,
                "text_field": text_field,
            }
        )
コード例 #2
0
def create_tasks(dict_name, source_field, min_document_frequency_relative, max_n_gram_len, corpus=None, concurrency=5):
    """Create ``concurrency`` parallel ngramize operators for one dictionary.

    Each operator handles slice ``i`` of ``concurrency`` of the documents
    (``process_num`` / ``total_proc``). Returns the list of created
    operators so callers can wire dependencies — previously the list was
    built and silently discarded.
    """
    lemmatize_operators = []
    for i in range(concurrency):
        lemmatize_operators.append(DjangoOperator(
            task_id=f"ngramize_{dict_name}_{i}",
            python_callable=ngramize,
            op_kwargs={
                "dict_name": dict_name,
                "source_field": source_field,
                "max_n_gram_len": max_n_gram_len,
                "min_document_frequency_relative": min_document_frequency_relative,
                "process_num": i,
                "total_proc": concurrency,
                "corpus": corpus,
            }
        ))
    return lemmatize_operators
コード例 #3
0
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 90,
    'pool': 'short_tasks'
}

# Nightly DAG that collects per-topic statistics for every actualizable TM.
dag = DAG(
    'NLPmonitor_get_topics_info',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='0 23 * * *',
)

topic_info_getters = []
with dag:
    for tm in actualizable_bigartms:
        # Prefer the transliterated name in the task id when available.
        tm_label = tm['name'] if not tm['name_translit'] else tm['name_translit']
        topic_info_getter = DjangoOperator(
            task_id=f"get_topics_info_{tm_label}",
            python_callable=calc_topics_info,
            op_kwargs={
                "corpus": tm["filters"]['corpus'],
                "topic_modelling_name": tm['name'],
                "topic_weight_threshold": tm["filters"].get('topic_weight_threshold'),
            },
        )
        topic_info_getters.append(topic_info_getter)
コード例 #4
0
from DjangoOperator import DjangoOperator
from datetime import datetime, timedelta

from dags.pre_caching.services.service import pre_cache


# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 24),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=15),
    'priority_weight': 25,
    'pool': 'short_tasks'
    # 'queue': 'bash_queue',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Hourly DAG that pre-warms the dashboard cache.
dag = DAG(
    'Nlpmonitor_pre_cache',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='0 * * * *',
)


with dag:
    send_elastic = DjangoOperator(
        task_id="pre_cache_dashboard",
        python_callable=pre_cache,
    )
コード例 #5
0
from dags.criterion_eval.init_criterions.service import init_criterions

# Default task arguments applied to every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 11, 14),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=15),
    'priority_weight': 95,
    'pool': 'short_tasks'
    # 'queue': 'bash_queue',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Daily (noon) DAG that (re)initializes criterion records.
dag = DAG('Criterion_init_var',
          catchup=False,
          max_active_runs=1,
          default_args=default_args,
          schedule_interval='0 12 * * *')

with dag:
    # NOTE(review): task_id "init_critetions" looks like a typo for
    # "init_criterions"; renaming a task_id resets its run history in
    # Airflow, so it is intentionally left unchanged here.
    init_sources = DjangoOperator(
        task_id="init_critetions",
        python_callable=init_criterions,
    )
コード例 #6
0
from dags.eval_dicts.services.calc_dict import calc_eval_dicts

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 12),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 90,
    'pool': 'short_tasks'
}

# Manually triggered DAG (no schedule) that builds evaluation dictionaries.
dag = DAG(
    'Generate_eval_dicts',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,
)

with dag:
    evaluator = DjangoOperator(
        task_id="calc_eval_dicts",
        python_callable=calc_eval_dicts,
        op_kwargs={
            "topic_modellings_list": ("bigartm_two_years",),
        },
    )
コード例 #7
0
File: scrap.py — Project: KindYAK/NLPMonitor-DAGs
# Scraper operators bucketed by account priority tier (low/medium/high).
scrapers_low = []
scrapers_medium = []
scrapers_high = []
# Scraping by accounts
for social_network in networks:
    # Low
    with dag_low:
        scraper = DjangoOperator(
            task_id=f"scrap_{social_network['name']}_by_account_low",
            python_callable=scrap_wrapper,
            op_kwargs={
                "social_network":
                social_network['id'],
                "accounts":
                list(
                    filter(
                        lambda x:
                        (x['social_network'] == social_network['id']) and
                        (x['priority_rate'] <= 25), accounts)),
                "by":
                "account",
            })
    scrapers_low.append(scraper)

    # Medium
    with dag_medium:
        scraper = DjangoOperator(
            task_id=f"scrap_{social_network['name']}_by_account_medium",
            python_callable=scrap_wrapper,
            op_kwargs={
コード例 #8
0
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 40,
    'pool': 'short_tasks',
}

# Weekly DAG (Friday 06:15) that recomputes activity stats per ES index.
dag = DAG('NLPMonitor_es_activity_update', catchup=False, max_active_runs=1, concurrency=10, default_args=default_args, schedule_interval='15 6 * * 5')

# Indices to refresh, configured via an Airflow Variable holding a JSON list.
indices = json.loads(Variable.get('indices_update_activity', default_var="[]"))

with dag:
    # NOTE(review): this assignment shadows the imported callable
    # `init_update_datetime`; the RHS is evaluated before the rebinding so it
    # works, but downstream dependency wiring (not visible here) likely uses
    # this name — renaming would need a coordinated change.
    init_update_datetime = DjangoOperator(
            task_id=f"init_update_datetime",
            python_callable=init_update_datetime,
        )
    updaters = []
    for index in indices:
        updaters.append(DjangoOperator(
            task_id=f"update_{index['name_translit']}",
            python_callable=es_update,
            op_kwargs={
                "index": index['name'],
            }
        )
        )
    # NOTE(review): same shadowing pattern as above for `set_update_datetime`.
    set_update_datetime = DjangoOperator(
            task_id=f"set_update_datetime",
            python_callable=set_update_datetime,
        )
コード例 #9
0
    # Dictionary-word filters used to limit the vocabulary for co-occurrence
    # computation (frequency band, stop-word exclusion).
    dictionary_filters = {
        "dictionary": "default_dict_pymorphy_2_4_393442_3710985",
        "document_normal_frequency__gte": 100,
        "document_normal_frequency__lte": 500000,
        "is_stop_word": False,
        # "is_in_pymorphy2_dict": True,
        # "is_multiple_normals_in_pymorphy2": False,
    }
    # Hard cap on the vocabulary size fed into the computation.
    max_dict_size = 30000
    # NOTE(review): this assignment shadows the imported callable
    # `generate_cooccurrence_codistance`; the RHS is evaluated first so it
    # works, but a distinct variable name would be clearer.
    generate_cooccurrence_codistance = DjangoOperator(
        task_id="generate_cooccurrence_codistance",
        python_callable=generate_cooccurrence_codistance,
        op_kwargs={
            "name": "test",
            "dictionary_filters": dictionary_filters,
            "max_dict_size": max_dict_size,
            "document_filters": {
                "corpus": "main",
                # "source": "https://kapital.kz/",
                # Effectively unbounded date range.
                "datetime__gte": date(1950, 1, 1),
                "datetime__lte": date(2050, 1, 1),
            },
        })

    topic_modelling_operator = DjangoOperator(
        task_id="topic_modelling",
        python_callable=topic_modelling,
        op_kwargs={
            "name": "test",
            "d1":
            1.75,  # Максимальное допустимое расстояние между всеми возможными попарными комбинациями объектов в составе формируемых сгустков (нечётких протокластеров)
            "d2":
コード例 #10
0
from DjangoOperator import DjangoOperator
from datetime import datetime, timedelta

from dags.es_activity_update.init_indices.service import init_indices

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 5, 22),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=15),
    'priority_weight': 95,
    'pool': 'short_tasks'
    # 'queue': 'bash_queue',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Daily (06:00) DAG that initializes the ES activity-update index registry.
dag = DAG(
    'NLPMonitor_es_activity_update_init_indices',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='0 6 * * *',
)

with dag:
    init_sources = DjangoOperator(
        task_id="init_indices",
        python_callable=init_indices,
    )
コード例 #11
0
# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 5, 3),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'priority_weight': 20,
    'pool': 'long_tasks',
}


# Manually triggered DAG (no schedule) for RuBERT embedding generation.
dag = DAG(
    'Nlpmonitor_generate_rubert_embeddings',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,
)

with dag:
    # Smoke-test task: verifies connectivity to the BERT service before any
    # heavy work. NOTE(review): the variable name `init_word_index` does not
    # match the task — presumably left over from an earlier version.
    init_word_index = DjangoOperator(
        task_id="test_connections_to_bert_service",
        python_callable=test_connections_to_bert_service,
        pool="short_tasks",  # overrides the 'long_tasks' default pool
        op_kwargs={
            "created": datetime.now(),  # evaluated at DAG-parse time
        }
    )
コード例 #12
0
    'retries': 0,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 90,
    'pool': 'short_tasks'
}

# Manually triggered DAG (no schedule) that computes MMA evaluations.
dag = DAG(
    'Calc_mma_eval',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,
)

mmas = []
with dag:
    mmas.append(
        DjangoOperator(
            task_id="calculate_mma",
            python_callable=calc_mma,
            op_kwargs={
                "topic_modelling_name": "bigartm_two_years_main_and_gos2",
                "criterion_ids": (1, 35, 34),
                "criterion_weights": ((0.44, 0.33, 0.23),),
                "class_ids": (36,),
                "perform_actualize": False,
            },
        )
    )

    mmas.append(DjangoOperator(
        task_id=f"calculate_mma_surveys",
        python_callable=calc_mma,
        op_kwargs={
            "topic_modelling_name": "bigartm_two_years_main_and_gos2",
            "criterion_ids": (1, 35, 34, 37), # Тональность (негатив!!), Резонансность, Гос. программы, Опросы
            "criterion_weights": (
                (0.44, 0.33, 0, 0.23),
                (0, 0.2, 0.4, 0.4),
コード例 #13
0
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 90,
    'pool': 'short_tasks'
}

# Nightly full actualization DAG plus an hourly "fast" variant sharing the
# same default_args.
dag = DAG('Criterion_actualize_evaluations', catchup=False, max_active_runs=1, default_args=default_args, schedule_interval='0 23 * * *')
dag_fast = DAG('Criterion_actualize_evaluations_fast', catchup=False, max_active_runs=1, default_args=default_args, schedule_interval='55 * * * *')

actualizers_evaluators = []
with dag:
    for eval in actualizable_criterion_evals:
        evaluator = DjangoOperator(
            task_id=f"eval_actualize_{eval['criterion_name']}_{eval['topic_modelling_translit']}",
            python_callable=evaluate,
            op_kwargs={
                "perform_actualize": True,
                "criterion_id": eval["criterion_id"],
                "topic_modelling": eval["topic_modelling"],
            }
        )
        actualizers_evaluators.append(evaluator)
        if 'calc_virt_negative' in eval:
            evaluator = DjangoOperator(
                task_id=f"eval_actualize_{eval['criterion_name']}_{eval['topic_modelling_translit']}_neg",
                python_callable=evaluate,
                op_kwargs={
                    "perform_actualize": True,
                    "criterion_id": eval["criterion_id"],
                    "topic_modelling": eval["topic_modelling"],
                    "calc_virt_negative": True,
                }
コード例 #14
0
"""
Code that goes along with the Airflow tutorial located at:
https://github.com/apache/airflow/blob/master/airflow/example_dags/tutorial.py
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonVirtualenvOperator, PythonOperator
from DjangoOperator import DjangoOperator
from datetime import datetime, timedelta

from dags.examples.external_file_example.es_io import es_etl

# Default task arguments shared by every task in this example DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 7, 25),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'pool': 'short_tasks',
}

# Example DAG demonstrating Elasticsearch ETL via a DjangoOperator.
dag = DAG(
    'Example_es_io',
    default_args=default_args,
    schedule_interval=None,
)

with dag:
    django_op = DjangoOperator(
        task_id="ES_ETL",
        python_callable=es_etl,
        op_kwargs={"stuff": "stuff))"},
    )
コード例 #15
0
from DjangoOperator import DjangoOperator
from datetime import datetime, timedelta

from dags.scraper_social.init_accounts.service import init_accounts

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 9, 25),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=15),
    'priority_weight': 95,
    'pool': 'scraper_tasks',
    # 'queue': 'second',
}

# Daily (noon) DAG that (re)initializes social-network scraping accounts.
dag = DAG(
    'Scrapers_init_social_accounts',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='0 12 * * *',
)

with dag:
    init_sources = DjangoOperator(
        task_id="init_accounts",
        python_callable=init_accounts,
    )
コード例 #16
0
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 5, 18),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 40,
    'pool': 'scraper_tasks',
    # 'queue': 'second',
}

# Midnight DAG that refreshes source-activity statistics in parallel.
dag = DAG(
    'Scrapers_update_activity',
    catchup=False,
    max_active_runs=2,
    default_args=default_args,
    schedule_interval='0 0 * * *',
)

with dag:
    scrapers = []
    concurrency = 4
    for worker in range(concurrency):
        # Each worker handles an equal percentage slice of the sources.
        scrapers.append(
            DjangoOperator(
                task_id=f"scrap_{worker}",
                python_callable=update,
                op_kwargs={
                    "start": (100 / concurrency) * worker,
                    "end": (100 / concurrency) * (worker + 1),
                },
            )
        )
コード例 #17
0
 # One actualization task per registered topic modelling.
 for tm in actualizable_bigartms:
     tm_filters = tm["filters"]
     # Prefer the transliterated name in the task id when available.
     tm_label = tm['name'] if not tm['name_translit'] else tm['name_translit']
     bigartm_calc_operator = DjangoOperator(
         task_id=f"bigartm_actualize_{tm_label}",
         python_callable=bigartm_calc,
         op_kwargs={
             "perform_actualize": True,
             "name": tm['name'],
             "name_translit": tm['name_translit'],
             "text_field": tm['text_field'],
             "corpus": tm_filters['corpus'],
             "datetime_from": tm_filters['datetime_from'],
             "datetime_to": tm_filters['datetime_to'],
             "source": tm_filters['source'],
             "group_id": tm_filters.get('group_id'),
             "topic_weight_threshold": tm_filters.get('topic_weight_threshold', 0.05),
             "regularization_params": tm["regularization_params"],
         },
     )
     actualizers_calcs.append(bigartm_calc_operator)
コード例 #18
0
from DjangoOperator import DjangoOperator
from datetime import datetime, timedelta

from dags.get_proxy_list.services.service_proxy import get_proxy_list

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    'priority_weight': 50,
    'pool': 'scraper_tasks',
    'execution_timeout': timedelta(hours=1),
    # 'queue': 'second',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Daily (noon) DAG that refreshes the proxy list used by the scrapers.
dag = DAG(
    'Scrapers_get_proxy_list',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='0 12 * * *',
)

with dag:
    proxy_op = DjangoOperator(
        task_id="get_proxy_list",
        python_callable=get_proxy_list,
        execution_timeout=timedelta(hours=1),
    )
コード例 #19
0
from dags.astana_test.external_file_example.my_package import test

# Default task arguments shared by every task in this test DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 11),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'pool': 'short_tasks',
}

# Manually triggered DAG that smoke-tests the 'second' worker queue.
dag = DAG(
    'Astana_test',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,
)

with dag:
    simple_op = PythonOperator(
        task_id="test_simple",
        python_callable=lambda: "Hello, NurSultan!",
        queue='second',
    )

    django_op = DjangoOperator(
        task_id="test_django",
        python_callable=test,
        queue='second',
    )

    simple_op >> django_op
コード例 #20
0
from dags.document_location.services.get_locations import get_locations

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 12, 12),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 90,
    'pool': 'short_tasks'
}

# Manually triggered DAG (no schedule) that extracts document locations.
dag = DAG(
    'Generate_document_locations',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,
)

with dag:
    evaluator = DjangoOperator(
        task_id="document_locations",
        python_callable=get_locations,
        op_kwargs={
            # Pairs of (topic modelling name, criterion id) to process.
            "criterion_tm_duos": (("bigartm_two_years", 1),),
        },
    )
コード例 #21
0
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 40,
    'pool': 'scraper_tasks',
    # 'queue': 'second',
}

# Manually triggered DAG that scrapes specific sources from per-URL lists.
dag = DAG('Scrapers_scrap_per_url', catchup=False, max_active_runs=2, default_args=default_args, schedule_interval=None)

# os.path.join(BASE_DAG_DIR, "tmp", f"urls_{source_id}.txt")
with dag:
    scraper1 = DjangoOperator(
        task_id="scrap_svoboda",
        python_callable=scrap,
        op_kwargs={"source_id": 67},
    )

    scraper2 = DjangoOperator(
        task_id="scrap_rt",
        python_callable=scrap,
        op_kwargs={"source_id": 60},
    )

    scraper3 = DjangoOperator(
        task_id=f"scrap_sputnik",
        python_callable=scrap,
コード例 #22
0
    # 'end_date': datetime(2016, 1, 1),
}

# Manually triggered DAG that clusters sources per topic modelling.
dag = DAG(
    'ML_cluster_sources',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,  # '15 22 * * *'
)

with dag:
    cluster_operators = []
    # DBSCAN-style hyper-parameter grid shared by every clustering task.
    cluster_operators.append(
        DjangoOperator(
            task_id="cluster_rus",
            python_callable=run_cluster,
            op_kwargs={
                "tm_name": "bigartm_two_years_rus_and_rus_propaganda",
                "eps_range": [i / 10 for i in range(1, 11)],
                "min_samples_range": range(1, 10),
            },
        )
    )

    cluster_operators.append(
        DjangoOperator(
            task_id="cluster_rus_kz",
            python_callable=run_cluster,
            op_kwargs={
                "tm_name": "bigartm_two_years_rus_and_main",
                "eps_range": [i / 10 for i in range(1, 11)],
                "min_samples_range": range(1, 10),
            },
        )
    )

    cluster_operators.append(
        DjangoOperator(task_id=f"cluster_kz",
コード例 #23
0
     list(
         filter(
             lambda x: x.isalnum() or x in ['.', '-', '_'],
             criterion['name_translit'].replace(":", "_").replace(
                 " ", "_"))))
 # Sanitize the transliterated TM name into a valid Airflow task-id
 # fragment: keep alphanumerics plus '.', '-', '_', mapping ':' and ' '
 # to '_'.
 filtered_topic_modelling = "".join(
     list(
         filter(
             lambda x: x.isalnum() or x in ['.', '-', '_'],
             tm['name_translit'].replace(":",
                                         "_").replace(" ", "_"))))
 # One evaluation task per (criterion, topic modelling) pair.
 evaluators.append(
     DjangoOperator(
         task_id=
         f"eval_{filtered_criterion_name}_{filtered_topic_modelling}",
         python_callable=evaluate,
         op_kwargs={
             "criterion_id": criterion['id'],
             "topic_modelling": tm['name'],
         }))
 # Register the pair so the actualization DAG can re-run it later.
 actualizable_criterion_evals.append({
     "criterion_id":
     criterion['id'],
     "criterion_name":
     filtered_criterion_name,
     "topic_modelling":
     tm['name'],
     "topic_modelling_translit":
     filtered_topic_modelling,
 })
 if criterion['calc_virt_negative']:
     evaluators.append(
コード例 #24
0
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 4, 14),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
    'priority_weight': 40,
    'pool': 'short_tasks'
}

# Manually triggered DAG that searches for topic combos per comboable TM.
dag = DAG(
    'NLPmonitor_TM_Combo_Finder',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval=None,
)

combo_finders = []
with dag:
    for tm in comboable_bigartms:
        # Prefer the transliterated name in the task id when available.
        tm_label = tm['name'] if not tm['name_translit'] else tm['name_translit']
        bigartm_calc_operator = DjangoOperator(
            task_id=f"tm_combo_{tm_label}",
            python_callable=find_combos,
            op_kwargs={
                "name": tm['name'],
                "name_translit": tm['name_translit'],
            },
        )
        combo_finders.append(bigartm_calc_operator)
コード例 #25
0
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonVirtualenvOperator, PythonOperator
from DjangoOperator import DjangoOperator
from datetime import datetime, timedelta

from dags.examples.external_file_example.my_package import test

# Default task arguments shared by every task in this example DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 7, 25),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'pool': 'short_tasks',
}

# Example DAG demonstrating a minimal DjangoOperator task.
dag = DAG(
    'Example_django_op_example',
    default_args=default_args,
    schedule_interval=None,
)

with dag:
    django_op = DjangoOperator(
        task_id="test_corpus_create",
        python_callable=test,
    )
コード例 #26
0
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Hourly DAG that lemmatizes English documents with 4 parallel workers.
dag = DAG(
    'Nlpmonitor_Lemmatization_eng',
    catchup=False,
    max_active_runs=1,
    concurrency=4,
    default_args=default_args,
    schedule_interval='20 * * * *',
)

with dag:
    # init_last_datetime = DjangoOperator(
    #     task_id="init_last_datetime",
    #     python_callable=init_last_datetime,
    #     op_kwargs={
    #     }
    # )

    concurrency = 4
    lemmatize_operators = []
    for worker in range(concurrency):
        # Each worker processes its slice via process_num / total_proc.
        lemmatize_operators.append(
            DjangoOperator(
                task_id=f"lemmatize_{worker}",
                python_callable=preprocessing_raw_data,
                op_kwargs={
                    "process_num": worker,
                    "total_proc": concurrency,
                },
            )
        )
    # init_last_datetime >> lemmatize_operators
コード例 #27
0
    'pool': 'short_tasks'
    # 'queue': 'bash_queue',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Hourly DAG that lemmatizes Kazakh documents with 3 parallel workers.
dag = DAG(
    'Nlpmonitor_Lemmatization_kz',
    catchup=False,
    max_active_runs=1,
    concurrency=5,
    default_args=default_args,
    schedule_interval='20 * * * *',
)

with dag:
    # NOTE: this rebinds the imported callable name `init_last_datetime`;
    # the RHS is evaluated first, so the operator is built correctly.
    init_last_datetime = DjangoOperator(
        task_id="init_last_datetime",
        python_callable=init_last_datetime,
    )

    concurrency = 3
    lemmatize_operators = []
    for worker in range(concurrency):
        # Each worker handles an equal percentage slice of the documents.
        lemmatize_operators.append(
            DjangoOperator(
                task_id=f"lemmatize_{worker}",
                python_callable=preprocessing_raw_data,
                op_kwargs={
                    "start": (100 / concurrency) * worker,
                    "end": (100 / concurrency) * (worker + 1),
                },
            )
        )
    # All lemmatizers run after the datetime initialization task.
    init_last_datetime >> lemmatize_operators
コード例 #28
0
          schedule_interval=None)

with dag:
    # Corpora whose documents feed the dictionary generation.
    corpuses = ["scopus_real_real"]
    # name = "kz_rus_ngrams_dict_pymorphy_2_4_393442_3710985"
    # name = "kz_rus_yandex_ngrams_dict"
    # name = "en_lemminflect"
    name = "en_scopus_extend"
    max_n_gram_len = 3
    field_to_parse = "text_lemmatized_eng_lemminflect"

    # NOTE(review): this assignment shadows the imported callable
    # `init_dictionary_index`; the RHS is evaluated before the rebinding,
    # so it works, but a distinct variable name would be clearer.
    init_dictionary_index = DjangoOperator(
        task_id="init_dictionary_index",
        python_callable=init_dictionary_index,
        op_kwargs={
            "corpuses": corpuses,
            "name": name,
            "datetime": datetime.now(),  # evaluated at DAG-parse time
            "max_n_gram_len": max_n_gram_len,
            "field_to_parse": field_to_parse,
        })

    # Fan-out width for the batch dictionary-generation tasks below.
    concurrency = 150
    dictionary_operators = []
    for i in range(concurrency):
        dictionary_operators.append(
            DjangoOperator(task_id=f"dictionary_{i}",
                           python_callable=generate_dictionary_batch,
                           op_kwargs={
                               "name": name,
                               "process_num": i,
                               "total_proc": concurrency,
コード例 #29
0
from dags.elastic_sender.sender.service import send_elastic

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 9, 12),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=15),
    'priority_weight': 80,
    'pool': 'short_tasks'
    # 'queue': 'bash_queue',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Hourly DAG that ships freshly scraped documents to Elasticsearch.
dag = DAG(
    'Nlpmonitor_send_elastic',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='15 * * * *',
)

with dag:
    # NOTE: rebinds the imported callable name `send_elastic`; the RHS is
    # evaluated first, so the operator is built correctly.
    send_elastic = DjangoOperator(
        task_id="send_elastic",
        python_callable=send_elastic,
    )
コード例 #30
0
from dags.scraper.init_sources.service import init_sources

# Default task arguments shared by every task in this DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 9, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=15),
    'priority_weight': 95,
    'pool': 'scraper_tasks',
    # 'queue': 'second',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# Daily (noon) DAG that (re)initializes scraping sources.
dag = DAG(
    'Scrapers_init_sources',
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    schedule_interval='0 12 * * *',
)

with dag:
    # NOTE: rebinds the imported callable name `init_sources`; the RHS is
    # evaluated first, so the operator is built correctly.
    init_sources = DjangoOperator(
        task_id="init_sources",
        python_callable=init_sources,
        op_kwargs={
            "sources_full": {70},  # source ids to scrape in full
        },
    )