Example #1
    def test_exclude_fields_parameter(self):
        c1 = Company.objects.create(name="Foo Products, Ltd.")
        c2 = Company.objects.create(name="Bar Microcontrollers, Inc.")

        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        e2 = Employee.objects.create(name="Isaac", age=9, company=c2)

        # We should update Scott's age; 'company' is excluded, so neither employee's company is touched.
        new_objs = [
            Employee(name="Scott", age=41, company=c1),
            Employee(name="Isaac", age=9, company=c1),
        ]

        with self.assertRaises(FieldDoesNotExist):
            # Crashes because we attempted to exclude a field that does not exist
            bulk_sync(new_models=new_objs, filters=None, key_fields=("name",), exclude_fields=["missing_field"])

        ret = bulk_sync(new_models=new_objs, filters=None, key_fields=("name",), exclude_fields=["company"])

        new_e1 = Employee.objects.get(id=e1.id)
        self.assertEqual("Scott", new_e1.name)
        self.assertEqual(41, new_e1.age)
        self.assertEqual(c1, new_e1.company)

        new_e2 = Employee.objects.get(id=e2.id)
        self.assertEqual("Isaac", new_e2.name)
        self.assertEqual(9, new_e2.age)
        self.assertEqual(c2, new_e2.company)

        self.assertEqual(2, ret["stats"]["updated"])
        self.assertEqual(0, ret["stats"]["created"])
        self.assertEqual(0, ret["stats"]["deleted"])
Example #2
    def test_provided_pk_is_retained_but_raises_if_mismatch_with_keyfield(self):
        c1 = Company.objects.create(name="Foo Products, Ltd.")
        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        new_objs = [Employee(id=e1.id, name="Notscott", age=41, company=c1)]

        with self.assertRaises(IntegrityError):
            # Crashes because e1.id already exists in the database: 'name' doesn't match,
            # so bulk_sync attempts an INSERT with a duplicate primary key.
            ret = bulk_sync(new_models=new_objs,
                            filters=Q(company_id=c1.id),
                            key_fields=("name", ))

        # pick a primary key that is guaranteed not to exist yet
        unique_pk = Employee.objects.values_list(
            "id", flat=True).order_by("-id").first() + 1
        new_objs = [
            Employee(id=unique_pk, name="Notscott", age=41, company=c1)
        ]
        ret = bulk_sync(new_models=new_objs,
                        filters=Q(company_id=c1.id),
                        key_fields=("name", ))

        self.assertEqual(0, ret["stats"]["updated"])
        self.assertEqual(1, ret["stats"]["created"])  # Added 'Notscott'
        self.assertEqual(1, ret["stats"]["deleted"])  # Deleted 'Scott'

        # Make sure we retained the PK
        self.assertEqual(Employee.objects.filter(id=unique_pk).count(), 1)
Example #3
    def test_empty_new_models_class_detection_works(self):
        c1 = Company.objects.create(name="My Company LLC")

        with self.assertRaises(RuntimeError):
            # Crashes because the model class cannot be inferred from an empty new_models list without db_class
            ret = bulk_sync(new_models=[], filters=None, key_fields=("name",))

        ret = bulk_sync(new_models=[], filters=None, key_fields=("name",), db_class=Employee)
        ret = bulk_sync(new_models=Employee.objects.none(), filters=None, key_fields=("name",))
Example #4
    def get_stats(self):
        """
        Loads Admitad statistics into the database for at most 365 days, or starting
        from the last unprocessed action, or from the last action.
        :return:
        """
        # yesterday, Moscow time
        date_to = datetime.now(
            pytz.timezone('Europe/Moscow')).date() - timedelta(days=1)
        # date from which Admitad data is collected; before it, subid was recorded incorrectly:
        # min_date = date(day=5, month=2, year=2020)
        date_from = AdmitadAction.get_stats_date_from()
        if not date_from:
            # fetch actions for all time
            date_from = date(day=1, month=1, year=2020)

        # actions are fetched only for programs that still exist
        program_ids = set(AdmitadProgram.objects.values_list('id', flat=True))
        actions = []
        positions = []
        offset = 0
        # Load the data in batches
        while True:
            # fetch statistics, inclusive of date_from and date_to
            res = self.client.StatisticActions.get(
                offset=offset,
                limit=500,
                date_start=date_from.strftime("%d.%m.%Y"),
                date_end=date_to.strftime("%d.%m.%Y"))

            for item in res['results']:
                action = self.action_from_api_item(item)
                # only keep actions whose program still exists
                if action.program_id in program_ids:
                    actions.append(action)
                    positions.extend(self.positions_from_api_item(item))
            offset += 500
            if res['_meta']['count'] < res['_meta']['limit'] + res['_meta']['offset']:
                break
        filter = Q(action_time__gte=date_from) if date_from else Q(pk__isnull=False)
        # persist the actions
        bulk_sync(new_models=actions, key_fields=['id'], filters=filter)
        # persist the positions within each action
        bulk_sync(
            new_models=positions,
            key_fields=['id'],
            filters=Q(action__in=actions))
        result_str = f"Admitad stats collected {len(actions)} actions"
        logging.info(result_str)
        return result_str
Example #5
    def test_skip_deletes(self):
        c1 = Company.objects.create(name="My Company LLC")

        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        e2 = Employee.objects.create(name="Isaac", age=9, company=c1)

        # update Scott - this makes Isaac the "stale" object that would be deleted if skip_deletes were False
        new_objs = [
            Employee(name="Scott", age=41, company=c1),
        ]

        # but Isaac should remain when the skip_deletes flag is True
        ret = bulk_sync(new_models=new_objs,
                        filters=None,
                        key_fields=("name", ),
                        skip_deletes=True)

        self.assertEqual(
            ["Scott", "Isaac"],
            [x.name for x in Employee.objects.all().order_by('id')])

        new_e1 = Employee.objects.get(id=e1.id)
        self.assertEqual(41, new_e1.age)

        self.assertEqual(2, Employee.objects.count())

        self.assertEqual(1, ret["stats"]["updated"])
        self.assertEqual(0, ret["stats"]["created"])
        self.assertEqual(0, ret["stats"]["deleted"])
Example #6
def sync_dfcs():
    scryfall_query = "https://api.scryfall.com/cards/search?q=is:dfc%20-layout:art_series%20-layout:double_faced_token"
    response = json.loads(requests.get(scryfall_query).content)

    # maintain list of all dfcs found so far
    q_dfcpairs = []

    for x in response['data']:
        # retrieve front and back names for this card, then create a DFCPair for it and append to list
        front_name = x['card_faces'][0]['name']
        back_name = x['card_faces'][1]['name']
        q_dfcpairs.append(
            DFCPair(front=to_searchable(front_name),
                    back=to_searchable(back_name)))

    # synchronise the located DFCPairs to database
    t0 = time.time()
    key_fields = ('front', )
    ret = bulk_sync(new_models=q_dfcpairs,
                    key_fields=key_fields,
                    filters=None,
                    db_class=DFCPair)

    print(
        "Finished synchronising database with Scryfall DFCs, which took {} seconds."
        .format(time.time() - t0))
Example #7
    def test_skip_updates(self):
        c1 = Company.objects.create(name="My Company LLC")

        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        e2 = Employee.objects.create(name="Isaac", age=9, company=c1)

        # update an employee (the update will be skipped) and create a new one
        new_objs = [
            Employee(name="Scott", age=100, company=c1),
            Employee(name="Alice", age=36, company=c1)
        ]

        ret = bulk_sync(new_models=new_objs,
                        filters=None,
                        key_fields=("name", ),
                        skip_updates=True)

        # the age should not have been updated
        new_e1 = Employee.objects.get(id=e1.id)
        self.assertEqual(40, new_e1.age)

        # Isaac is "stale" object - was deleted, Alice was created
        self.assertEqual(2, Employee.objects.count())
        self.assertEqual(
            ["Scott", "Alice"],
            [x.name for x in Employee.objects.all().order_by('id')])

        self.assertEqual(0, ret["stats"]["updated"])
        self.assertEqual(1, ret["stats"]["created"])
        self.assertEqual(1, ret["stats"]["deleted"])
Example #8
    def test_fields_parameter(self):
        c1 = Company.objects.create(name="Foo Products, Ltd.")
        c2 = Company.objects.create(name="Bar Microcontrollers, Inc.")

        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        e2 = Employee.objects.create(name="Isaac", age=9, company=c2)

        # We should update Scott's age, and not touch company.
        new_objs = [
            Employee(name="Scott", age=41, company=c1),
            Employee(name="Isaac", age=9, company=c1),
        ]

        ret = bulk_sync(new_models=new_objs,
                        filters=None,
                        key_fields=("name", ),
                        fields=['age'])

        new_e1 = Employee.objects.get(id=e1.id)
        self.assertEqual("Scott", new_e1.name)
        self.assertEqual(41, new_e1.age)
        self.assertEqual(c1, new_e1.company)

        new_e2 = Employee.objects.get(id=e2.id)
        self.assertEqual("Isaac", new_e2.name)
        self.assertEqual(9, new_e2.age)
        self.assertEqual(c2, new_e2.company)

        self.assertEqual(2, ret["stats"]["updated"])
        self.assertEqual(0, ret["stats"]["created"])
        self.assertEqual(0, ret["stats"]["deleted"])
Example #9
def sync_sources(sources):
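    # Sync the given Source objects against every existing Source row (filters=None),
    # matching on 'id'; existing Sources missing from `sources` are deleted.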
    key_fields = ('id', )
    ret = bulk_sync(
        new_models=sources,
        key_fields=key_fields,
        filters=None,
        db_class=Source
    )
Example #10
    def test_pk_set_but_keyfield_changes_ignores_pk(self):
        c1 = Company.objects.create(name="Foo Products, Ltd.")
        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        new_objs = [Employee(id=e1.id, name="Notscott", age=41, company=c1)]
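        # The new object reuses e1's pk but changes the key field ('name');
        # per the test name, matching is driven by key_fields, not the supplied pk.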

        ret = bulk_sync(new_models=new_objs,
                        filters=Q(company_id=c1.id),
                        key_fields=("name", ))
Example #11
    def test_all_features_at_once(self):
        c1 = Company.objects.create(name="Foo Products, Ltd.")
        c2 = Company.objects.create(name="Bar Microcontrollers, Inc.")

        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        e2 = Employee.objects.create(name="Isaac", age=9, company=c1)
        e3 = Employee.objects.create(name="Zoe", age=9, company=c1)
        e4 = Employee.objects.create(name="Bob", age=25, company=c2)

        # We should update Scott's and Isaac's rows, delete Zoe, add Newguy, and
        # add a second Bob (since he's not in company c1, which we filtered on).
        new_objs = [
            Employee(name="Scott", age=41, company=c1),
            Employee(name="Isaac", age=9, company=c1),
            Employee(name="Newguy", age=10, company=c1),
            Employee(name="Bob", age=50, company=c1),
        ]

        ret = bulk_sync(new_models=new_objs,
                        filters=Q(company_id=c1.id),
                        key_fields=("name", ))

        self.assertEqual(2, ret["stats"]["updated"])
        self.assertEqual(2, ret["stats"]["created"])
        self.assertEqual(1, ret["stats"]["deleted"])

        self.assertEqual(4, Employee.objects.filter(company=c1).count())
        self.assertEqual(1, Employee.objects.filter(company=c2).count())

        new_e1 = Employee.objects.get(id=e1.id)
        self.assertEqual("Scott", new_e1.name)
        self.assertEqual(41, new_e1.age)
        self.assertEqual(c1, new_e1.company)

        new_e2 = Employee.objects.get(id=e2.id)
        self.assertEqual("Isaac", new_e2.name)
        self.assertEqual(9, new_e2.age)
        self.assertEqual(c1, new_e2.company)

        with self.assertRaises(Employee.DoesNotExist):
            Employee.objects.get(id=e3.id)

        new_e4 = Employee.objects.get(id=e4.id)
        self.assertEqual("Bob", new_e4.name)
        self.assertEqual(25, new_e4.age)
        self.assertEqual(c2, new_e4.company)

        new_e3 = Employee.objects.get(name="Newguy")
        self.assertEqual("Newguy", new_e3.name)
        self.assertEqual(10, new_e3.age)
        self.assertEqual(c1, new_e3.company)

        new_e5 = Employee.objects.get(name="Bob", company=c1)
        self.assertEqual("Bob", new_e5.name)
        self.assertEqual(50, new_e5.age)
        self.assertEqual(c1, new_e5.company)
Example #12
def update_aqi():
    dataframe = pd.read_csv(AQI_DATA_URL,
                            delimiter='|',
                            usecols=range(17),
                            names=AQI_DATA_COLUMN_NAMES,
                            header=None,
                            parse_dates=AQI_DATA_DATES,
                            infer_datetime_format=True,
                            dayfirst=False)

    reporting_areas = [
        AirNowReportingArea(name=area[0],
                            state_code=area[1],
                            location=Point(float(area[3]), float(area[2])))
        for (area, _) in dataframe.groupby(
            ['ReportingArea', 'StateCode', 'Latitude', 'Longitude'])
    ]

    _print_status(bulk_sync(reporting_areas, ('name', 'state_code'), None))

    forecast_sources = [
        AirNowForecastSource(name=name)
        for (name, _) in dataframe.groupby('ForecastSource')
    ]
    _print_status(bulk_sync(forecast_sources, ('name', ), None))

    reporting_areas_db = list(AirNowReportingArea.objects.all())
    forecast_sources_db = list(AirNowForecastSource.objects.all())

    observations = [
        _convert_observation(o, reporting_areas_db, forecast_sources_db)
        for (_, o) in dataframe.iterrows() if o['StateCode'] != '  '
    ]
    observations = [
        observation for observation in observations if observation is not None
    ]
    _print_status(
        bulk_sync(observations,
                  ('reporting_area_id', 'issued_date', 'valid_date',
                   'record_sequence', 'parameter_name', 'aqi_value',
                   'aqi_category', 'primary_pollutant', 'type'), None))
Example #13
    def get_programs(self):
        """Load partner programs and their tariffs."""
        r = requests.get(self.offers_url)
        result = r.json()['data']
        for offer_data in result:
            program, _ = AdvCakeProgram.objects.update_or_create(
                id=offer_data['id'], defaults={'name': offer_data['name']})
            # load the tariffs
            tariffs = []
            for tariff_data in offer_data['bids']:
                tariff = Tariff(id=tariff_data['id'],
                                size=tariff_data['value'],
                                name=tariff_data['text'],
                                is_percentage=tariff_data['type'] == 'percent',
                                program=program)
                if tariff.is_percentage:
                    tariff.size /= 100
                else:
                    # convert to kopecks
                    tariff.size = round(tariff.size * 100)
                tariffs.append(tariff)
            bulk_sync(tariffs, key_fields=['pk'], filters=Q(program=program))
Example #14
def sync_dfcs():
    scryfall_query_dfc = "https://api.scryfall.com/cards/search?q=is:dfc%20-layout:art_series%20-layout:double_faced_token"
    response_dfc = json.loads(requests.get(scryfall_query_dfc).content)

    data = response_dfc["data"]
    while response_dfc["has_more"]:
        response_dfc = json.loads(
            requests.get(response_dfc["next_page"]).content)
        data += response_dfc["data"]

    # maintain list of all dfcs found so far
    q_dfcpairs = []

    for x in data:
        # retrieve front and back names for this card, then create a DFCPair for it and append to list
        front_name = x["card_faces"][0]["name"]
        back_name = x["card_faces"][1]["name"]
        q_dfcpairs.append(
            DFCPair(front=to_searchable(front_name),
                    back=to_searchable(back_name)))

    # also retrieve meld pairs and save them as DFCPairs
    time.sleep(0.1)
    scryfall_query_meld = "https://api.scryfall.com/cards/search?q=is:meld%"
    response_meld = json.loads(requests.get(scryfall_query_meld).content)

    for x in response_meld["data"]:
        card_part = [y for y in x["all_parts"] if y["name"] == x["name"]][0]
        meld_result = [
            y for y in x["all_parts"] if y["component"] == "meld_result"
        ][0]["name"]
        if card_part["component"] == "meld_part":
            is_top = "\n(Melds with " not in x["oracle_text"]
            card_bit = "Top" if is_top else "Bottom"
            q_dfcpairs.append(
                DFCPair(
                    front=to_searchable(x["name"]),
                    back=to_searchable(f"{meld_result} {card_bit}"),
                ))

    # synchronise the located DFCPairs to database
    t0 = time.time()
    key_fields = ("front", )
    ret = bulk_sync(new_models=q_dfcpairs,
                    key_fields=key_fields,
                    filters=None,
                    db_class=DFCPair)

    print(
        "Finished synchronising database with Scryfall DFCs, which took {} seconds."
        .format(time.time() - t0))
Example #15
    def test_select_for_update_of_pk(self):
        self.c1 = Company.objects.create(name="Foo Products, Ltd.")
        self.c2 = Company.objects.create(name="Bar Microcontrollers, Inc.")

        self.e1 = EmployeeWithOffice.objects.create(name="Scott",
                                                    age=40,
                                                    company=self.c1)
        self.e2 = EmployeeWithOffice.objects.create(name="Isaac",
                                                    age=9,
                                                    company=self.c1)
        self.e3 = EmployeeWithOffice.objects.create(name="Zoe",
                                                    age=9,
                                                    company=self.c1)
        self.e4 = EmployeeWithOffice.objects.create(name="Bob",
                                                    age=25,
                                                    company=self.c2)
        self.e5 = EmployeeWithOffice.objects.create(name="Newguy",
                                                    age=55,
                                                    company=self.c2)

        self.o1 = Office.objects.create(id="office1")
        self.o2 = Office.objects.create(id="office2")
        self.o3 = Office.objects.create(id="office3")

        self.eo1 = EmployeeOffice.objects.create(employee=self.e1,
                                                 office=self.o1)
        self.eo2 = EmployeeOffice.objects.create(employee=self.e2,
                                                 office=self.o1)
        self.eo3 = EmployeeOffice.objects.create(employee=self.e3,
                                                 office=self.o2)
        self.eo4 = EmployeeOffice.objects.create(employee=self.e4,
                                                 office=self.o2)

        e5 = self.e5
        o3 = self.o3
        new_objs = [EmployeeOffice(employee=e5, office=o3)]

        # select_for_update_of=("self",) should limit the SELECT ... FOR UPDATE row
        # locks to EmployeeOffice's own rows (Django's select_for_update(of=...)).
        ret = bulk_sync(new_models=new_objs,
                        filters=None,
                        key_fields=("office_id", "employee_id"),
                        skip_deletes=True,
                        skip_updates=True,
                        select_for_update_of=("self", ))

        self.assertEqual(0, ret["stats"]["updated"])
        self.assertEqual(0, ret["stats"]["deleted"])
        self.assertEqual(1, ret["stats"]["created"])

        eos = o3.employees.all()
        self.assertEqual(1, eos.count())
        self.assertEqual(o3, e5.office_set.first())
Example #16
def update_cityzipcode():
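    # Stream the pipe-delimited AirNow city/zipcode feed and sync it into
    # AirNowReportingAreaZipCode rows keyed on 'zipcode'.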
    with closing(requests.get(CITY_ZIPCODES_URL, stream=True)) as source:
        reader = csv.DictReader(codecs.iterdecode(source.iter_lines(),
                                                  'utf-8'),
                                delimiter='|')
        new_models = [
            AirNowReportingAreaZipCode(city=row['City'],
                                       state=row['State'],
                                       zipcode=int(row['Zipcode']),
                                       location=Point(float(row['Longitude']),
                                                      float(row['Latitude'])))
            for row in reader
        ]

    _print_status(bulk_sync(new_models, ('zipcode', ), None))
Example #17
    def get_stats(self):
        """
        Loads all AdvCake actions into the database for all time, or starting from
        the last unprocessed action, or from the last action.
        :return:
        """
        actions = []
        positions = []

        # yesterday, Moscow time
        batch_date_to = date_to = datetime.datetime.now(
            pytz.timezone('Europe/Moscow')).date() - datetime.timedelta(days=1)
        # None means there are no actions yet, so load all data
        date_from = AdvCakeAction.get_stats_date_from()
        batch_date_from = batch_date_to - datetime.timedelta(days=70 - 1)

        # Load the data in 70-day batches until the result is empty
        # or we reach an action earlier than date_from
        while True:
            # fetch statistics, inclusive of date_from and date_to
            r = requests.get(
                self.stats_url.format(date_from=batch_date_from.isoformat(), date_to=batch_date_to.isoformat()))
            result = xmltodict.parse(r.text)
            # an empty result means everything has been loaded
            if not result['items']:
                break
            # actions come in reverse order, so the last one is the earliest
            for item in result['items']['item']:
                action = self.action_from_api_item(item)
                actions.append(action)
                positions.extend(self.positions_from_api_item(action))
            # stop once we see an action dated on or before date_from
            if date_from and actions[-1].action_time.date() <= date_from:
                break
            # fetch data for the previous 70 days
            batch_date_to = batch_date_from - datetime.timedelta(days=1)
            batch_date_from = batch_date_to - datetime.timedelta(days=70 - 1)
        filter = Q(action_time__gte=date_from) if date_from else Q(pk__isnull=False)
        # persist the actions
        bulk_sync(new_models=actions, key_fields=['id'], filters=filter)
        # persist the positions within each action
        bulk_sync(new_models=positions, key_fields=['id'],
                  filters=Q(action__in=actions))
Example #18
    def test_skip_creates(self):
        c1 = Company.objects.create(name="My Company LLC")

        e1 = Employee.objects.create(name="Scott", age=40, company=c1)
        e2 = Employee.objects.create(name="Isaac", age=9, company=c1)

        # create a new employee that will be ignored
        new_objs = [Employee(name="John", age=52, company=c1)]

        ret = bulk_sync(new_models=new_objs, filters=None, key_fields=("name",), skip_creates=True, skip_deletes=True)

        self.assertEqual(2, Employee.objects.count())
        self.assertEqual(["Scott", "Isaac"], [x.name for x in Employee.objects.all().order_by("id")])

        self.assertEqual(0, ret["stats"]["updated"])
        self.assertEqual(0, ret["stats"]["created"])
        self.assertEqual(0, ret["stats"]["deleted"])
Example #19
def load_okpd2():
    csv_model_names_matching = {
        'Name': 'name',
        'global_id': 'global_id',
        'Razdel': 'section',
        'Kod': 'code',
        'Nomdescr': 'description'
    }
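    # (keys are the CSV column headers, values the Okpd2 model field names they map to)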

    file_path = os.path.join(data_dir_path, "okpd2.csv")
    models = import_data_from_csv(Okpd2, file_path, csv_model_names_matching)

    result = bulk_sync(new_models=models,
                       filters=[],
                       key_fields=['code', 'name', 'global_id'])
    print("Results of Okpd2 bulk_sync: "
          "{created} created, {updated} updated, {deleted} deleted.".format(
              **result['stats']))
Example #20
    def test_new_objs_with_unprepped_field_values_are_processed_correctly(self):
        c1 = Company.objects.create(name="Foo Products, Ltd.")
        c2 = Company.objects.create(name="Bar Microcontrollers, Inc.")
        e1 = Employee.objects.create(name="Scott", age=40, company=c1)

        new_objs = [Employee(name="Scott", age="40", company=c2)]

        ret = bulk_sync(
            new_models=new_objs,
            filters=None,
            key_fields=("name", "age"),
        )

        # we should update e1's company to c2
        self.assertEqual(1, ret["stats"]["updated"])
        self.assertEqual(c2, Employee.objects.get(pk=e1.pk).company)

        # we should not create or delete anything
        self.assertEqual(0, ret["stats"]["created"])
        self.assertEqual(0, ret["stats"]["deleted"])
        self.assertEqual(1, Employee.objects.count())
Example #21
def search_folder(service, source, folder):
    print("Searching drive: {}\n".format(source.id))

    # maintain list of cards, cardbacks, and tokens found for this Source
    q_cards = []
    q_cardbacks = []
    q_tokens = []

    # crawl the drive to retrieve a complete list of images it contains
    images = crawl_drive(service, folder)
    print("Number of images found: {}".format(len(images)))

    # add the retrieved cards to the database
    for x in images:
        add_card(folder, source, x, q_cards, q_cardbacks, q_tokens)

    print("\nFinished crawling {}.\nSynchronising to database.".format(folder["name"]))

    # set up key fields and filter for bulk_sync on this Source, then synchronise the located cards
    t0 = time.time()
    key_fields = ("id",)
    source_filter = Q(source=source.id)

    # Synchronise q_cards with Cards, q_cardbacks with Cardbacks, and q_tokens with Tokens
    queue_object_map = [(q_cardbacks, Cardback), (q_tokens, Token), (q_cards, Card)]

    for queue, db_class in queue_object_map:
        bulk_sync(
            new_models=queue, key_fields=key_fields, filters=source_filter, db_class=db_class
        )

    print(
        "Finished synchronising to database, which took {} seconds.\n".format(
            time.time() - t0
        )
    )
Example #22
def save_urls(start_page_path: str,
              start_page_num: int = 1,
              start_org_num: int = 1,
              end_page_num: int = None,
              end_org_num=None):
    parser = ListOrgParser()
    try:
        orgs_urls = parser.parse_orgs_list_pages(
            orgs_list_page_path=start_page_path,
            start_page_num=start_page_num,
            start_org_num=start_org_num,
            end_page_num=end_page_num,
            end_org_num=end_org_num)
        orgs_urls = [OrganizationUrl(**org_url) for org_url in orgs_urls]
        ret = bulk_sync(new_models=orgs_urls,
                        filters=[],
                        key_fields=('url', ),
                        skip_deletes=True)
    except Exception as e:
        logger.exception(str(e))
    else:
        logger.info(
            "Results of bulk_sync: {created} created, {updated} updated, {deleted} deleted."
            .format(**ret['stats']))
Example #23
def _scrape_comments_insta(start_date, insta):
    logger.info(f"Scraping Instagram comments for {insta.name}")
    dfs = quintly.get_insta_comments(insta.quintly_profile_id,
                                     start_date=start_date)

    post_cache: Dict[str, InstaPost] = {}

    for df in dfs:
        comments = list(_scrape_comments_insta_day(insta, post_cache, df))

        # bulk_sync breaks if there are no comments
        if not comments:
            continue

        posts = set(comment.post for comment in comments)
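        # parent posts of this batch, used below to scope the sync filter Q(post__in=posts)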

        sync_results = bulk_sync(
            comments,
            ["external_id"],
            Q(post__in=posts),
            batch_size=100,
            skip_deletes=True,
        )
        logger.debug(sync_results)
Example #24
    def handle(self, *args, **options):
        years = [2016, 2017, 2018, 2019, 2020]
        final_passing_data = pd.DataFrame()
        final_rushing_data = pd.DataFrame()
        final_receiving_data = pd.DataFrame()

        for year in years:
            current_passing_data = pd.read_csv(
                f"https://github.com/mrcaseb/nfl-data/blob/master/data/ngs/ngs_{year}_passing.csv.gz?raw=True",
                compression='gzip',
                low_memory=False,
            )

            final_passing_data = final_passing_data.append(
                current_passing_data, sort=True)

            current_rushing_data = pd.read_csv(
                f"https://github.com/mrcaseb/nfl-data/blob/master/data/ngs/ngs_{year}_rushing.csv.gz?raw=True",
                compression='gzip',
                low_memory=False,
            )

            final_rushing_data = final_rushing_data.append(
                current_rushing_data, sort=True)

            current_receiving_data = pd.read_csv(
                f"https://github.com/mrcaseb/nfl-data/blob/master/data/ngs/ngs_{year}_receiving.csv.gz?raw=True",
                compression='gzip',
                low_memory=False,
            )

            final_receiving_data = final_receiving_data.append(
                current_receiving_data, sort=True)

        # Give each row a unique index
        final_passing_data.reset_index(drop=True, inplace=True)
        final_rushing_data.reset_index(drop=True, inplace=True)
        final_receiving_data.reset_index(drop=True, inplace=True)

        # Converting NAs to None
        final_receiving_data = final_receiving_data.where(
            pd.notnull(final_receiving_data), None)

        # Changing the column names
        final_passing_data = final_passing_data.rename(
            columns={
                "player_first_name": "first_name",
                "player_last_name": "last_name",
                "player_display_name": "full_name",
                "player_short_name": "short_name",
                "player_gsis_id": "gsis_id",
                "player_position": "position",
                "team_abbr": "team",
            })
        final_rushing_data = final_rushing_data.rename(
            columns={
                "player_first_name": "first_name",
                "player_last_name": "last_name",
                "player_display_name": "full_name",
                "player_short_name": "short_name",
                "player_gsis_id": "gsis_id",
                "player_position": "position",
                "team_abbr": "team",
            })
        final_receiving_data = final_receiving_data.rename(
            columns={
                "player_first_name": "first_name",
                "player_last_name": "last_name",
                "player_display_name": "full_name",
                "player_short_name": "short_name",
                "player_gsis_id": "gsis_id",
                "player_position": "position",
                "team_abbr": "team",
            })

        # Getting all unique players
        passing_players = final_passing_data[NAME_COLUMNS].drop_duplicates(
            subset=['gsis_id'], keep='last')
        rushing_players = final_rushing_data[NAME_COLUMNS].drop_duplicates(
            subset=['gsis_id'], keep='last')
        receiving_players = final_receiving_data[NAME_COLUMNS].drop_duplicates(
            subset=['gsis_id'], keep='last')

        # TO GET UNIQUE PLAYERS ACROSS ALL THREE
        players = passing_players.append(rushing_players, sort=True)
        players = players.append(receiving_players, sort=True)
        players = players.drop_duplicates(subset=['gsis_id'], keep='last')

        # Adding players to database; if they exist, update them
        new_players = [Player(**vals) for vals in players.to_dict('records')]
        key_fields = ('gsis_id', )
        ret = bulk_sync(new_models=new_players,
                        filters=None,
                        key_fields=key_fields,
                        skip_deletes=True)

        # Adding passing stats to database; if they exist, update them
        final_passing_data = final_passing_data.drop(
            columns=PASS_STATS_TO_REMOVE)
        new_passing_data = [
            PassingStats(**vals)
            for vals in final_passing_data.to_dict('records')
        ]
        key_fields = ('gsis_id', 'season', 'week')
        ret = bulk_sync(new_models=new_passing_data,
                        filters=None,
                        key_fields=key_fields,
                        skip_deletes=True)

        # Adding rushing stats to database; if they exist, update them
        final_rushing_data = final_rushing_data.drop(
            columns=RUSH_STATS_TO_REMOVE)
        new_rushing_data = [
            RushingStats(**vals)
            for vals in final_rushing_data.to_dict('records')
        ]
        key_fields = ('gsis_id', 'season', 'week')
        ret = bulk_sync(new_models=new_rushing_data,
                        filters=None,
                        key_fields=key_fields,
                        skip_deletes=True)

        # Adding receiving stats to database; if they exist, update them
        final_receiving_data = final_receiving_data.drop(
            columns=REC_STATS_TO_REMOVE)
        new_receiving_data = [
            ReceivingStats(**vals)
            for vals in final_receiving_data.to_dict('records')
        ]
        key_fields = ('gsis_id', 'season', 'week')
        ret = bulk_sync(new_models=new_receiving_data,
                        filters=None,
                        key_fields=key_fields,
                        skip_deletes=True)
Example #25
    def process_uploaded_file(self, request: HttpRequest, file: UploadedFile):
        """Implementation to handle uploaded files, required by the ``UploadFileMixin``.
        Parses files exported from the YouTube Analytics backend and creates the
        corresponding entries in the database.
        Some checks were added to provide hints to the user if the file has an unexpected
        structure.
        Results are saved in
        :class:`~okr.models.youtube.YouTubeVideoAnalyticsExtra`.
        Args:
            request (HttpRequest): The request generated by the upload form
            file (UploadedFile): The uploaded file
        """
        logger.info("Uploaded file: {}", file.name)

        try:
            df = pd.read_csv(self.open_zip(file)["Table data.csv"])
        except zipfile.BadZipFile:
            self.message_user(
                request,
                f'Datei "{file.name}" ist keine ZIP-Datei. Bitte lade die Datei so hoch, wie sie aus YouTube Studio raus kommt.',
                level=messages.ERROR,
            )
            return

        logger.debug(df)

        df.fillna(0, inplace=True)

        filename = file.name
        name_pattern = r"Video (\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2}).*"
        parsed_name = re.match(name_pattern, filename)

        if not parsed_name:
            self.message_user(
                request,
                f'Datei "{filename}" ist nicht der richtige Export (Dateiname beginnt nicht mit "Video"). Wähle zum Export den Tab "Video" in den YouTube Studio Analytics.',
                level=messages.ERROR,
            )
            return

        start_date, end_date = parsed_name.groups()
        start_date = date.fromisoformat(start_date)
        end_date = date.fromisoformat(end_date)

        if end_date - start_date > timedelta(days=1, hours=2):
            self.message_user(
                request,
                f'Datei "{filename}" scheint mehr als einen Tag zu umfassen',
                level=messages.ERROR,
            )
            return

        if "Video" not in df.columns:
            self.message_user(
                request,
                f'Datei "{filename}" enthält keine Video-Zahlen! '
                'Wähle den Tab "Video" in den YouTube Studio Analytics.',
                level=messages.ERROR,
            )
            return

        if "Impressions" not in df.columns:
            self.message_user(
                request,
                f'Datei "{filename}" enthält keine Impressions! '
                'Wähle den Tab "Video" in den YouTube Studio Analytics.',
                level=messages.ERROR,
            )
            return

        if "Impressions click-through rate (%)" not in df.columns:
            self.message_user(
                request,
                f'Datei "{filename}" enthält keine Click-Through-Rate! '
                'Wähle den Tab "Video" in den YouTube Studio Analytics.',
                level=messages.ERROR,
            )
            return

        df = df[df["Video"] != "Total"]
        df["Clicks"] = (df["Impressions click-through rate (%)"] *
                        df["Impressions"] / 100).astype("int")

        youtube_videos = YouTubeVideo.objects.filter(
            external_id__in=df["Video"].unique().tolist())

        new_models = []
        youtube_videos = {video.external_id: video for video in youtube_videos}

        for index, row in df.iterrows():
            if row["Video"] not in youtube_videos:
                logger.warning(
                    "Could not find YouTube video with external_id {}",
                    row["Video"])
                continue

            new_models.append(
                YouTubeVideoAnalyticsExtra(
                    youtube_video=youtube_videos[row["Video"]],
                    date=start_date,
                    impressions=row["Impressions"],
                    clicks=row["Clicks"],
                ))

        result = bulk_sync(
            new_models=new_models,
            filters=Q(date=start_date),
            key_fields=["youtube_video_id", "date"],
            skip_deletes=True,
        )

        logger.info(result)

        self.message_user(request,
                          f'Datei "{filename}" erfolgreich eingelesen!')