class TwitterSearch:
    __solr = None
    __core = None
    __api = None

    def __init__(self, oauth):
        super().__init__()
        self.__solr = SolrClient(iu.solr_url)
        self.__core = iu.solr_core_tweets
        self.__api = tweepy.API(oauth)

    def index(self, keywords):
        for keyword in keywords:
            count = 0
            for status in tweepy.Cursor(self.__api.search,
                                        q=keyword,
                                        tweet_mode="extended",
                                        lang="en").items(500):
                count += 1
                # created_at: format the tweet's own timestamp for Solr
                # (status.created_at is already a datetime; datetime.utcnow()
                # would give the current time, not the tweet's)
                str_solr_time = status.created_at.strftime(SOLR_TIME_PATTERN)
                docs = [{
                    'id': status.id,
                    'created_at': str_solr_time,
                    'status_text': status.full_text
                }]
                self.__solr.index(self.__core, docs)
            print(str(count) + "," + keyword)
        code = iu.commit(iu.solr_core_tweets)
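A minimal usage sketch for the class above (not part of the original example), assuming tweepy v3, where API.search still exists; all credentials are placeholders:

import tweepy

# hypothetical credentials; substitute real Twitter API keys
oauth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
oauth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")

searcher = TwitterSearch(oauth)
searcher.index(["solr", "lucene"])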
Example 2
def update_pmi_scores(existing_tags: dict,
                      existing_tag_pairs: dict,
                      solr: SolrClient, core_name, batch_commit):
    count = 0
    batch = []
    for tag_pair, data in existing_tag_pairs.items():
        count += 1
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))

        co_freq = data[util.tag_index_field_frequency]
        tags = tag_pair.split(" ")
        t1_freq = existing_tags[tags[0]][util.tag_index_field_frequency]
        t2_freq = existing_tags[tags[1]][util.tag_index_field_frequency]

        if co_freq == 0:
            pmi = 0
        else:
            # PMI-style score: log of the co-occurrence frequency over the
            # product of the individual frequencies (the small constant in
            # the denominator guards against division by zero)
            pmi = numpy.emath.log(
                co_freq / (t1_freq * t2_freq + util.score_denominator_min))
        data[util.tag_index_field_pmi] = pmi
        data[util.tag_index_field_text] = tag_pair
        data[util.tag_index_field_type] = 1
        batch.append(data)

    # commit the rest
    solr.index(core_name, batch)
    code = util.commit(core_name)
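For orientation (not part of the original example): the score above is a PMI-style association measure, log(co_freq / (t1_freq * t2_freq + eps)), where util.score_denominator_min plays the role of eps. A toy calculation with made-up frequencies:

import numpy

# illustrative values only: a pair seen together 10 times, with individual
# tag frequencies 100 and 50, and a tiny epsilon in the denominator
co_freq, t1_freq, t2_freq, eps = 10, 100, 50, 1e-10
pmi = numpy.emath.log(co_freq / (t1_freq * t2_freq + eps))
print(pmi)  # log(10 / 5000) is approximately -6.21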
Example 3
def update_tagrisk_scores(existing_tags: dict,
                          solr: SolrClient, core_name, batch_commit):
    count = 0
    batch = []
    for tag, data in existing_tags.items():
        count += 1
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))

        freq = data[util.tag_index_field_frequency]
        freqh = data[util.tag_index_field_frequencyh]

        if freqh == 0:
            riskscore = 0
        else:
            # risk score: square root of the hateful share of occurrences
            # (numpy.math was deprecated and removed in NumPy 2.0;
            # numpy.sqrt behaves the same for this scalar)
            riskscore = numpy.sqrt(freqh / (freq + util.score_denominator_min))
        data[util.tag_index_field_risk_score] = riskscore
        data[util.tag_index_field_text] = tag
        data[util.tag_index_field_type] = 0
        batch.append(data)

    # commit the rest
    solr.index(core_name, batch)
    code = util.commit(core_name)
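Similarly, the risk score above is the square root of the fraction of a tag's occurrences that were flagged (frequencyh over frequency). A toy calculation with made-up numbers:

import math

# illustrative values only: a tag seen 200 times, 50 of them flagged
freq, freqh, eps = 200, 50, 1e-10
riskscore = math.sqrt(freqh / (freq + eps))
print(riskscore)  # sqrt(50 / 200) = 0.5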
Example 4
def update_ml_tag(solr: SolrClient, tweets_core_name, tags_core_name, docs,
                  feat_vectorizer, ml_model, selected_features,
                  hate_indicative_features, scaling_option, sysout, logger):
    tweets = []
    for d in docs:
        text = d['status_text']
        if "rt @" in text.lower():
            start = text.lower().index("rt @") + 4
            text = text[start].strip()

        tweets.append(text)

    #ml classify, also compute risk scores
    logger.info("begin ml classification for tweets={}, time={}".format(
        len(tweets), datetime.datetime.now()))
    tags, risk_scores = ml_tag(tweets, feat_vectorizer, ml_model,
                               selected_features, hate_indicative_features,
                               scaling_option, sysout, logger, solr,
                               tags_core_name)

    logger.info("ml classification done. updating solr index...{}".format(
        datetime.datetime.now()))

    count = 0
    for idx, tag in enumerate(tags):
        if tag == 0:
            count += 1
            #print(d['status_text'])
        d = docs[idx]
        d['ml_tag'] = str(tag)
        d['tweet_risk'] = risk_scores[idx]

    print(count)
    solr.index(tweets_core_name, docs)
    code = iu.commit(tweets_core_name)
Example 5
    def handle(self, *args, **options):

        total = 0
        cycle = 0

        try:
            # Retrieve the Search and Field models from the database
            solr = SolrClient(settings.SOLR_SERVER_URL)
            try:
                self.search_target = Search.objects.get(
                    search_id=options['search'])
                self.solr_core = self.search_target.solr_core_name
                self.all_fields = Field.objects.filter(
                    search_id=self.search_target)
                if options['nothing_to_report']:
                    self.search_fields = Field.objects.filter(
                        search_id=self.search_target,
                        alt_format='ALL') | Field.objects.filter(
                            search_id=self.search_target, alt_format='NTR')
                else:
                    self.search_fields = Field.objects.filter(
                        search_id=self.search_target,
                        alt_format='ALL') | Field.objects.filter(
                            search_id=self.search_target, alt_format='')
                for search_field in self.search_fields:
                    self.csv_fields[search_field.field_id] = search_field

                    codes = Code.objects.filter(field_id=search_field)
                    # Most csv_fields will not have codes, so the queryset will be zero length
                    if len(codes) > 0:
                        code_dict = {}
                        for code in codes:
                            code_dict[code.code_id.lower()] = code
                        self.field_codes[search_field.field_id] = code_dict

            except Search.DoesNotExist as x:
                self.logger.error('Search not found: "{0}"'.format(x))
                exit(-1)
            except Field.DoesNotExist as x1:
                self.logger.error(
                    'Fields not found for search: "{0}"'.format(x1))

            # Process the records in the CSV file one at a time
            with open(options['csv'],
                      'r',
                      encoding='utf-8-sig',
                      errors="ignore") as csv_file:
                csv_reader = csv.DictReader(csv_file, dialect='excel')
                solr_items = []
                for csv_record in csv_reader:

                    # Clear out the Solr core on the first line
                    if total == 0 and not options['nothing_to_report']:
                        solr.delete_doc_by_query(self.solr_core, "*:*")
                        print("Purging all records")
                    elif total == 0 and options['nothing_to_report']:
                        solr.delete_doc_by_query(self.solr_core, "format:NTR")
                        solr.commit(self.solr_core, softCommit=True)
                        print("Purging NTR records")
                    total += 1
                    cycle += 1

                    # Call plugins if they exist for this search type. This is where a developer can introduce
                    # code to customize the data that is loaded into Solr for a particular search.
                    search_type_plugin = 'search.plugins.{0}'.format(
                        options['search'])
                    if search_type_plugin in self.discovered_plugins:
                        include, filtered_record = self.discovered_plugins[
                            search_type_plugin].filter_csv_record(
                                csv_record, self.search_target,
                                self.csv_fields, self.field_codes,
                                'NTR' if options['nothing_to_report'] else '')
                        if not include:
                            continue
                        else:
                            csv_record = filtered_record
                    # Create a dictionary for each record loaded into Solr
                    solr_record = {
                        'format': 'NTR' if options['nothing_to_report'] else 'DEFAULT'
                    }
                    for csv_field in csv_reader.fieldnames:
                        # Verify that it is a known field
                        if csv_field not in self.csv_fields and csv_field not in (
                                'owner_org_title', 'owner_org'):
                            self.logger.error(
                                "CSV file contains an unknown field: {0}".format(
                                    csv_field))
                            exit(-1)
                        if csv_field == 'owner_org_title':
                            continue

                        # Handle multi-valued fields here
                        if self.csv_fields[csv_field].solr_field_multivalued:
                            solr_record[csv_field] = csv_record[
                                csv_field].split(',')
                            # Copy fields for the report cannot use multi-values, so populate them directly with the original string
                            if self.csv_fields[csv_field].solr_field_export:
                                for extra_field in self.csv_fields[
                                        csv_field].solr_field_export.split(
                                            ','):
                                    solr_record[extra_field] = csv_record[
                                        csv_field]
                        else:
                            solr_record[csv_field] = csv_record[csv_field]

                        # Automatically expand out dates and numbers for use with Solr export handler
                        if self.csv_fields[
                                csv_field].solr_field_type == 'pdate':
                            try:
                                if csv_record[csv_field]:
                                    csv_date = datetime.strptime(
                                        csv_record[csv_field], '%Y-%m-%d')
                                    solr_record[csv_field +
                                                '_en'] = format_date(
                                                    csv_date, locale='en')
                                    solr_record[csv_field +
                                                '_fr'] = format_date(
                                                    csv_date, locale='fr')
                                    if self.csv_fields[
                                            csv_field].is_default_year:
                                        solr_record['year'] = csv_date.year
                                    if self.csv_fields[
                                            csv_field].is_default_month:
                                        solr_record['month'] = csv_date.month
                                else:
                                    solr_record[csv_field + '_en'] = ''
                                    solr_record[csv_field + '_fr'] = ''
                            except ValueError as x2:
                                self.logger.error(
                                    'Invalid date: "{0}"'.format(x2))
                                solr_record[csv_field] = ''
                                continue
                        elif self.csv_fields[csv_field].solr_field_type in [
                                'pint', 'pfloat'
                        ]:
                            if solr_record[csv_field]:
                                if solr_record[csv_field] == '.':
                                    solr_record[csv_field] = "0"
                                csv_decimal = parse_decimal(
                                    solr_record[csv_field], locale='en_US')
                                if self.csv_fields[
                                        csv_field].solr_field_is_currency:
                                    solr_record[csv_field +
                                                '_en'] = format_currency(
                                                    csv_decimal,
                                                    'CAD',
                                                    locale='en_CA')
                                    solr_record[csv_field +
                                                '_fr'] = format_currency(
                                                    csv_decimal,
                                                    'CAD',
                                                    locale='fr_CA')
                                else:
                                    solr_record[csv_field +
                                                '_en'] = format_decimal(
                                                    csv_decimal,
                                                    locale='en_CA')
                                    solr_record[csv_field +
                                                '_fr'] = format_decimal(
                                                    csv_decimal,
                                                    locale='fr_CA')
                            else:
                                solr_record[csv_field + '_en'] = ''
                                solr_record[csv_field + '_fr'] = ''

                        # Lookup the expanded code value from the codes dict of dict
                        if csv_field in self.field_codes:
                            if csv_record[csv_field]:

                                if self.csv_fields[
                                        csv_field].solr_field_multivalued:
                                    codes_en = []
                                    codes_fr = []
                                    for code_value in csv_record[
                                            csv_field].split(","):
                                        if code_value.lower(
                                        ) in self.field_codes[csv_field]:
                                            codes_en.append(
                                                self.field_codes[csv_field]
                                                [code_value.lower()].label_en)
                                            codes_fr.append(
                                                self.field_codes[csv_field]
                                                [code_value.lower()].label_fr)
                                        else:
                                            self.logger.info(
                                                "Unknown code value: {0} for field: {1}"
                                                .format(code_value, csv_field))
                                    solr_record[csv_field + '_en'] = codes_en
                                    solr_record[csv_field + '_fr'] = codes_fr
                                else:
                                    if csv_record[csv_field].lower(
                                    ) in self.field_codes[csv_field]:
                                        solr_record[csv_field +
                                                    '_en'] = self.field_codes[
                                                        csv_field][csv_record[
                                                            csv_field].lower(
                                                            )].label_en
                                        solr_record[csv_field +
                                                    '_fr'] = self.field_codes[
                                                        csv_field][csv_record[
                                                            csv_field].lower(
                                                            )].label_fr
                                    else:
                                        self.logger.info(
                                            "Unknown code value: {0} for field: {1}"
                                            .format(csv_record[csv_field],
                                                    csv_field))
                    solr_record = self.set_empty_fields(solr_record)
                    # Set the Solr ID field (Nothing To Report records are excluded)
                    if not options['nothing_to_report']:
                        if self.search_target.id_fields:
                            id_values = []
                            for id_field in self.search_target.id_fields.split(
                                    ","):
                                id_values.append(csv_record[id_field])
                            solr_record['id'] = ",".join(id_values)
                    else:

                        if 'month' in solr_record:
                            solr_record['id'] = "{0}-{1}-{2}".format(
                                solr_record['owner_org'], solr_record['year'],
                                solr_record['month'])
                        elif 'quarter' in solr_record:
                            solr_record['id'] = "{0}-{1}-{2}".format(
                                solr_record['owner_org'], solr_record['year'],
                                solr_record['quarter'])

                    # Call plugins if they exist for this search type. This is where a developer can introduce
                    # code to customize the data that is loaded into Solr for a particular search.
                    if search_type_plugin in self.discovered_plugins:
                        solr_record = self.discovered_plugins[
                            search_type_plugin].load_csv_record(
                                csv_record, solr_record, self.search_target,
                                self.csv_fields, self.field_codes,
                                'NTR' if options['nothing_to_report'] else '')

                    solr_items.append(solr_record)

                    # Write to Solr whenever the cycle threshold is reached
                    if cycle >= self.cycle_on:
                        # try to connect to Solr up to 10 times
                        for countdown in reversed(range(10)):
                            try:
                                solr.index(self.solr_core, solr_items)
                                print("{0} rows processed".format(total))
                                cycle = 0
                                solr_items.clear()
                                break
                            except ConnectionError as cex:
                                if not countdown:
                                    raise
                                print(
                                    "Solr error: {0}. Waiting to try again ... {1}"
                                    .format(cex, countdown))
                                time.sleep((10 - countdown) * 5)

                # Write any remaining records to Solr and commit
                if cycle > 0:
                    # try to connect to Solr up to 10 times
                    for countdown in reversed(range(10)):
                        try:
                            solr.index(self.solr_core, solr_items)
                            # total already counts every CSV record read above,
                            # so do not add len(solr_items) again here
                            print("{0} rows processed".format(total))
                            cycle = 0
                            solr_items.clear()
                            break
                        except ConnectionError as cex:
                            if not countdown:
                                raise
                            print(
                                "Solr error: {0}. Waiting to try again ... {1}"
                                .format(cex, countdown))
                            time.sleep((10 - countdown) * 5)

                solr.commit(self.solr_core, softCommit=True, waitSearcher=True)
                print("Total rows processed: {0}".format(total))

        except Exception as x:
            self.logger.error('Unexpected Error "{0}"'.format(x))
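A hedged invocation sketch for the management command above; the command name 'load_search_csv' is hypothetical, but search, csv, and nothing_to_report are the option names the handler actually reads:

from django.core.management import call_command

# 'load_search_csv' is a placeholder name; the keyword arguments mirror
# options['search'], options['csv'] and options['nothing_to_report'] above
call_command('load_search_csv',
             search='mysearch',
             csv='/path/to/records.csv',
             nothing_to_report=False)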
Example 6
class TwitterStream(StreamListener):
    __solr = None
    __core = None
    __count = 0
    __count_retweet = 0
    __google_maps = None

    def __init__(self, google_api_key):
        super().__init__()
        self.__solr = SolrClient(iu.solr_url)
        self.__core = iu.solr_core_tweets
        self.__google_maps = GoogleMaps(api_key=google_api_key)
        # self.__ml_model=util.load_ml_model(ml_model_file)
        # self.__selected_features = mutil.read_preselected_features(False, ml_selected_features)

    def ignoreRetweet(self, status_text):
        is_retweet = "rt @" in status_text.lower()
        if is_retweet:
            self.__count_retweet += 1
        # skip the tweet only when it is a retweet and retweets are ignored
        return IGNORE_RETWEETS and is_retweet

    def on_data(self, data):
        self.__count += 1
        #print(self.__count)
        if self.__count % 200 == 0:
            code = iu.commit(iu.solr_core_tweets)
            now = datetime.datetime.now()
            print("{} processed: {}, where {} are retweets".format(
                now, self.__count, self.__count_retweet))
            logger.info("{} processed: {}, where {} are retweets".format(
                now, self.__count, self.__count_retweet))
        jdata = None
        try:
            jdata = json.loads(data)

            if jdata is not None and "id" in jdata.keys() \
                    and not self.ignoreRetweet(jdata["text"]):
                # created_at: parse Twitter's timestamp and re-format for Solr
                # (avoid .utcnow(), which returns the current time rather than
                # the tweet's, and avoid shadowing the time module)
                str_created_at = jdata["created_at"]
                created_time = datetime.datetime.strptime(str_created_at,
                                                          TWITTER_TIME_PATTERN)
                str_solr_time = created_time.strftime(SOLR_TIME_PATTERN)

                # entities hashtags
                hashtags = jdata["entities"]["hashtags"]
                hashtag_list = []
                for hashtag in hashtags:
                    hashtag_list.append(hashtag["text"].lower())

                # entities urls
                urls = jdata["entities"]["urls"]
                url_list = []
                for url in urls:
                    url_list.append(url["expanded_url"])

                # entities symbols
                symbols = jdata["entities"]["symbols"]
                symbols_list = []
                for symbol in symbols:
                    symbols_list.append(symbol["text"])

                # entities user_mentions
                user_mentions = jdata["entities"]["user_mentions"]
                user_mention_list = []
                for um in user_mentions:
                    user_mention_list.append(um["id"])

                # quoted status id if exists
                if "quoted_status_id" in jdata:
                    quoted_status_id = jdata["quoted_status_id"]
                else:
                    quoted_status_id = None

                # place exists
                place = jdata["place"]
                if place is not None:
                    place_full_name = place["full_name"]
                    place_coordinates = place['bounding_box']['coordinates'][
                        0][0]
                else:
                    place_full_name = None
                    place_coordinates = None

                coordinates = jdata["coordinates"]
                # user_location, only compute geocode if other means have failed
                geocode_coordinates_of_user_location = []
                str_user_loc = jdata["user"]["location"]
                if str_user_loc is not None and "," in str_user_loc:
                    str_user_loc = str_user_loc.split(",")[0].strip()
                if str_user_loc is not None and len(str_user_loc) < 25 \
                        and coordinates is None and place_full_name is None:
                    geocode_obj = None
                    if str_user_loc in LOCATION_COORDINATES.keys():
                        geocode_obj = LOCATION_COORDINATES[str_user_loc]
                    else:
                        # geocode_obj=None #currently the api for getting geo codes seems to be unstable
                        try:
                            geocode_obj = self.__google_maps.search(
                                location=str_user_loc)
                            if geocode_obj is not None:
                                geocode_obj = geocode_obj.first()
                            LOCATION_COORDINATES[str_user_loc] = geocode_obj
                            if geocode_obj is not None:
                                geocode_coordinates_of_user_location.append(
                                    geocode_obj.lat)
                                geocode_coordinates_of_user_location.append(
                                    geocode_obj.lng)
                        except Exception as exc:
                            #traceback.print_exc(file=sys.stdout)
                            print("\t\t gmap error={}".format(
                                str_user_loc, exc))
                            logger.error("\t\t gmap: {}".format(str_user_loc))
                            try:
                                geocode_obj = geolocator.geocode(str_user_loc)
                                LOCATION_COORDINATES[
                                    str_user_loc] = geocode_obj
                                if geocode_obj is not None:
                                    geocode_coordinates_of_user_location.append(
                                        geocode_obj.latitude)
                                    geocode_coordinates_of_user_location.append(
                                        geocode_obj.longitude)
                            except Exception as exc:
                                #traceback.print_exc(file=sys.stdout)
                                print("\t\t GeoPy error={}".format(
                                    str_user_loc, exc))
                                logger.error(
                                    "\t\t GeoPy {}".format(str_user_loc))

                # ml_tag=util.ml_tag(jdata['text'], feat_vectorizer,self.__ml_model, self.__selected_features,
                #                    SCALING_STRATEGY, self.__sysout, logger)
                ml_tag = '0' if random.random() < 0.2 else '2'
                tweet_risk = random.uniform(0, 1.0)

                if coordinates is None:
                    coordinates = place_coordinates
                if coordinates is None:
                    coordinates = geocode_coordinates_of_user_location

                coord_lat = None
                coord_lon = None
                if coordinates is not None and len(coordinates) > 0:
                    coord_lat = coordinates[0]
                    coord_lon = coordinates[1]

                docs = [{
                    'id': jdata["id"],
                    'created_at': str_solr_time,
                    'coordinate_lat': coord_lat,
                    'coordinate_lon': coord_lon,
                    'favorite_count': jdata["favorite_count"],
                    'in_reply_to_screen_name': jdata["in_reply_to_screen_name"],
                    'in_reply_to_status_id': jdata["in_reply_to_status_id"],
                    'in_reply_to_user_id': jdata["in_reply_to_user_id"],
                    'lang': jdata["lang"],
                    'place_full_name': place_full_name,
                    'place_coordinates': place_coordinates,
                    'retweet_count': jdata["retweet_count"],
                    'retweeted': jdata["retweeted"],
                    'quoted_status_id': quoted_status_id,
                    'status_text': jdata["text"],
                    'entities_hashtag': hashtag_list,
                    'entities_symbol': symbols_list,
                    'entities_url': url_list,
                    'entities_user_mention': user_mention_list,
                    'user_id': jdata["user"]["id"],
                    'user_screen_name': jdata["user"]["screen_name"],
                    'user_statuses_count': jdata["user"]["statuses_count"],
                    'user_friends_count': jdata["user"]["friends_count"],
                    'user_followers_count': jdata["user"]["followers_count"],
                    'user_location': str_user_loc,
                    'user_location_coordinates': geocode_coordinates_of_user_location,
                    'ml_tag': ml_tag,
                    'tweet_risk': tweet_risk
                }]
                self.__solr.index(self.__core, docs)
        except Exception as exc:
            traceback.print_exc(file=sys.stdout)
            print(
                "Error encountered for {}, error:{} (see log file for details)"
                .format(self.__count, exc))
            if jdata is not None and "id" in jdata.keys():
                tweet_id = jdata["id"]
            else:
                tweet_id = "[failed to parse]"
            logger.info(
                "Error encountered for counter={}, tweet={}, error:{} (see log file for details)"
                .format(self.__count, tweet_id, exc))
            if jdata is not None:
                file = LOG_DIR + "/" + str(tweet_id) + ".txt"
                logger.info("\t input data json written to {}".format(file))
                with open(file, 'w') as outfile:
                    json.dump(jdata, outfile)
        return True

    def on_error(self, status):
        print(status)

    def on_status(self, status):
        print(status.text)
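A minimal wiring sketch for this listener (not part of the original example), assuming tweepy v3, where StreamListener exists; the credentials, Google API key, and track terms are all placeholders:

import tweepy

oauth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
oauth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")

listener = TwitterStream(google_api_key="GOOGLE_API_KEY")  # placeholder key
stream = tweepy.Stream(auth=oauth, listener=listener)
stream.filter(track=["keyword1", "keyword2"], languages=["en"])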
Example 7
import os
import json
import requests

CC_LINKS_FILES_DIRECTORIES = []
SOLR_INSTANCE_URL = ""
SOLR_CORE = ""

solr_client = SolrClient(SOLR_INSTANCE_URL)


def get_url_content(url):
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36'
    headers = {'User-Agent': user_agent}
    resp = requests.get(url, headers=headers)
    return resp.text


for directory in CC_LINKS_FILES_DIRECTORIES:
    # test the full path, since isfile() resolves relative to the cwd
    files = [f for f in os.listdir(directory)
             if os.path.isfile(os.path.join(directory, f))]
    for file in files:
        docs = []
        with open(directory + '/' + file, 'r') as f:
            for line in f:
                # json.loads() decodes str directly; the encoding argument
                # was removed in Python 3.9
                json_obj = json.loads(line)
                url = json_obj["url"]
                text = get_url_content(url)
                docs.append({"file_name": file, "html": text})
            solr_client.index(SOLR_CORE, docs)
            solr_client.commit(SOLR_CORE, openSearcher=True)
Example 8
class TwitterStream(StreamListener):
    #https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
    __solr_url = None
    __solr = None
    __core = None
    __count = 0
    __count_retweet = 0

    def __init__(self, solr_url):
        super().__init__()
        self.__solr_url = solr_url
        self.__solr = SolrClient(solr_url)
        self.__core = SOLR_CORE_TWEETS

    def on_data(self, data):
        self.__count += 1
        if self.__count % COMMIT_BATCH_SIZE == 0:
            code = commit(SOLR_CORE_TWEETS, self.__solr_url)
            now = datetime.datetime.now()
            print("{} processed: {}".format(now, self.__count))
            logger.info("{} processed: {}".format(now, self.__count))
        jdata = None
        try:
            jdata = json.loads(data)

            if jdata is not None and "id" in jdata.keys():
                # created_at: parse and re-format for Solr (avoid .utcnow(),
                # which returns the current time, not the tweet's)
                str_created_at = jdata["created_at"]
                created_time = datetime.datetime.strptime(str_created_at,
                                                          TWITTER_TIME_PATTERN)
                str_solr_time = created_time.strftime(SOLR_TIME_PATTERN)

                doc = {
                    'id': jdata["id_str"],
                    'created_at': str_solr_time,
                    'lang': jdata["lang"]
                }

                if "extended_tweet" in jdata:
                    doc['status_text'] = jdata["extended_tweet"]["full_text"]
                else:
                    doc['status_text'] = jdata["text"]
                self.collect_tweet_entities(doc, jdata)
                self.collect_tweet_quote_info(doc, jdata)
                self.collect_retweet_info(doc, jdata)
                self.collect_tweet_favorite_info(doc, jdata)
                self.collect_tweet_location_info(doc, jdata)
                self.collect_tweet_reply_info(doc, jdata)
                self.collect_tweet_user_info(doc, jdata)

                self.__solr.index(self.__core, [doc])
        except Exception as exc:
            traceback.print_exc(file=sys.stdout)
            print(
                "Error encountered for {}, error:{} (see log file for details)"
                .format(self.__count, exc))
            if jdata is not None and "id" in jdata.keys():
                tweet_id = jdata["id"]
            else:
                tweet_id = "[failed to parse]"
            logger.info(
                "Error encountered for counter={}, tweet={}, error:{} (see log file for details)"
                .format(self.__count, tweet_id, exc))
            if jdata is not None:
                file = LOG_DIR + "/" + str(tweet_id) + ".txt"
                logger.info("\t input data json written to {}".format(file))
                with open(file, 'w') as outfile:
                    json.dump(jdata, outfile)
        return True

    def on_error(self, status):
        print(status)

    def on_status(self, status):
        print(status.text)

    def collect_tweet_entities(self, doc: dict, tweet_json: dict):
        ##################### tweet entities ###################
        # entities hashtags
        if "extended_tweet" in tweet_json:
            entities = tweet_json["extended_tweet"]["entities"]
        else:
            entities = tweet_json["entities"]
        hashtags = entities["hashtags"]
        if 'entities_hashtag' in doc:
            hashtag_list = doc['entities_hashtag']
        else:
            hashtag_list = []
        for hashtag in hashtags:
            h = hashtag["text"].lower()
            if h not in hashtag_list:
                hashtag_list.append(h)

        # entities urls
        urls = entities["urls"]
        if 'entities_url' in doc:
            url_list = doc['entities_url']
        else:
            url_list = []
        for url in urls:
            url_list.append(url["expanded_url"])

        # entities symbols
        symbols = entities["symbols"]
        if 'entities_symbol' in doc:
            symbols_list = doc['entities_symbol']
        else:
            symbols_list = []
        for symbol in symbols:
            s = symbol["text"].lower()
            if s not in symbols_list:
                symbols_list.append(s)

        # entities user_mentions
        user_mentions = entities["user_mentions"]
        if 'entities_user_mention' in doc:
            user_mention_list = doc['entities_user_mention']
        else:
            user_mention_list = []
        for um in user_mentions:
            uid = str(um["id_str"]).lower()  # avoid shadowing the built-in id()
            if uid not in user_mention_list:
                user_mention_list.append(uid)

        # media
        if "extended_tweet" in tweet_json and "extended_entities" in tweet_json[
                "extended_tweet"]:
            ext_entities = tweet_json["extended_tweet"]["extended_entities"]
        elif "extended_entities" in tweet_json:
            ext_entities = tweet_json["extended_entities"]
        else:
            ext_entities = None
        if ext_entities is not None:
            media = ext_entities["media"]
            if media is not None:
                doc['entities_media_url'] = media[0]["media_url"]
                doc['entities_media_type'] = media[0]["type"]

        doc['entities_hashtag'] = hashtag_list
        doc['entities_symbol'] = symbols_list
        doc['entities_url'] = url_list
        doc['entities_user_mention'] = user_mention_list

    def collect_tweet_quote_info(self, doc: dict, tweet_json: dict):
        #################  quote ####################
        # quoted status id if exists
        if "quoted_status_id_str" in tweet_json:
            quoted_status_id = tweet_json["quoted_status_id_str"]
            self.collect_tweet_entities(doc, tweet_json['quoted_status'])
        else:
            quoted_status_id = None
        doc['quoted_status_id_str'] = quoted_status_id
        doc['is_quote_status'] = tweet_json["is_quote_status"]
        if "quote_count" in tweet_json:
            doc['quote_count'] = tweet_json["quote_count"]

    def collect_tweet_reply_info(self, doc: dict, tweet_json: dict):
        if "in_reply_to_screen_name" in tweet_json:
            doc['in_reply_to_screen_name'] = tweet_json[
                "in_reply_to_screen_name"]
        if "in_reply_to_status_id_str" in tweet_json:
            doc['in_reply_to_status_id_str'] = tweet_json[
                "in_reply_to_status_id_str"]
        if "in_reply_to_user_id_str" in tweet_json:
            doc['in_reply_to_user_id_str'] = tweet_json[
                "in_reply_to_user_id_str"]
        doc['reply_count'] = tweet_json["reply_count"]

    def collect_retweet_info(self, doc: dict, tweet_json: dict):
        doc['retweet_count'] = tweet_json["retweet_count"]
        doc['retweeted'] = tweet_json["retweeted"]
        if "retweeted_status" in tweet_json:
            doc['retweeted_status_id_str'] = tweet_json["retweeted_status"][
                "id_str"]
            self.collect_tweet_entities(doc, tweet_json['retweeted_status'])

    def collect_tweet_favorite_info(self, doc: dict, tweet_json: dict):
        doc['favorite_count'] = tweet_json["favorite_count"]  #nullable

    def collect_tweet_user_info(self, doc: dict, tweet_json: dict):
        doc['user_id_str'] = tweet_json["user"]["id_str"]
        doc['user_screen_name'] = tweet_json["user"]["screen_name"]
        doc['user_statuses_count'] = tweet_json["user"]["statuses_count"]
        doc['user_friends_count'] = tweet_json["user"]["friends_count"]
        doc['user_followers_count'] = tweet_json["user"]["followers_count"]
        doc['user_desc'] = tweet_json["user"]["description"]

    def collect_tweet_location_info(self, doc: dict, tweet_json: dict):
        # place exists
        place = tweet_json["place"]
        if place is not None:
            place_full_name = place["full_name"]
            place_coordinates = place['bounding_box']['coordinates'][0][0]
        else:
            place_full_name = None
            place_coordinates = None

        coordinates = tweet_json["coordinates"]
        # user_location, only compute geocode if other means have failed

        # if coordinates == None:
        #     coordinates = place_coordinates
        #
        # coord_lat = None
        # coord_lon = None
        # if coordinates is not None and len(coordinates) > 0:
        #     coord_lat = coordinates[0]
        #     coord_lon = coordinates[1]

        #doc['coordinate_lat']=coord_lat
        #doc['coordinate_lon']=coord_lon
        doc['place_full_name'] = place_full_name
        doc['place_coordinates'] = place_coordinates
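This listener wires up the same way as the one in Example 6, except its constructor takes the Solr URL directly; a brief sketch with placeholder values, again assuming tweepy v3:

import tweepy

oauth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
oauth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")

listener = TwitterStream("http://localhost:8983/solr")  # placeholder Solr URL
tweepy.Stream(auth=oauth, listener=listener).filter(track=["keyword"],
                                                    languages=["en"])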