Esempio n. 1
0
def load_csv_data(csvfilename, fields=None):
    """Load the rows of a comma-delimited CSV file.

    Args:
        csvfilename: path of the CSV file to read.
        fields: optional sequence of column names. When omitted, the first
            row of the file supplies the column names.

    Returns:
        A list with one entry per data row: the bare cell value when there is
        exactly one column, otherwise the row mapping produced by
        ``csv.DictReader``. Rows whose values simply repeat the column names
        (a header row) are skipped. Returns ``None`` when reading or parsing
        fails; the error is logged instead of raised.

    Raises:
        OSError: if the file cannot be opened (the open happens before the
            best-effort error handling starts).
    """
    logmsg(f'Loading {csvfilename}...')
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted cells are interpreted by the reader and not by the file object.
    csv_fp = open(csvfilename, 'r', newline='')
    data = []

    try:
        with csv_fp:
            csv_reader = csv.DictReader(csv_fp, fieldnames=fields, delimiter=",", quoting=csv.QUOTE_ALL)
            # Normalise to a list so the header comparison below also works
            # when the caller passed the column names as a tuple; an empty
            # file without explicit names yields no column names at all.
            columns = list(csv_reader.fieldnames or [])
            single_column = len(columns) == 1
            for row in csv_reader:
                if list(row.values()) == columns:
                    # A header row inside the data: not a record.
                    continue
                data.append(row[columns[0]] if single_column else row)

        logdebug(f'Loaded {len(data)} rows from {csvfilename}')
        return data

    except Exception as err:
        # Loading is best-effort: report through the log and signal failure
        # with None, as callers already received None on error.
        logmsg(f'Unable to load {csvfilename}: {err}')
        return None
Esempio n. 2
0
    def batch_tokenize_strings(self,
                               list_data,
                               field,
                               field_tokenized="tokenized",
                               ret_type="string",
                               maxlength=None):
        """Tokenize one field of every record in ``list_data``, in place.

        Args:
            list_data: mapping of key -> record, where each record is itself
                a mapping that contains ``field``.
            field: name of the record entry passed to ``self.tokenize_string``.
            field_tokenized: name of the record entry that receives the result.
            ret_type: shape of the stored result: ``"list"`` (unique tokens in
                first-occurrence order), ``"set"``, ``"dict"`` (ordered
                token -> token mapping); any other value stores the
                ``|tok1|tok2|`` string form.
            maxlength: optional limit; leading tokens are dropped until the
                ``|``-delimited string form is shorter than this value.

        Returns:
            ``list_data`` itself, with ``field_tokenized`` set on every
            processed record. Records stored under an empty-string key are
            skipped.
        """

        for k in list(list_data.keys()):
            if isinstance(k, str) and not k:
                logmsg("String value for key was empty.  Skipping...")
                continue

            record = list_data[k]
            # Copy so that trimming never mutates a list owned by the
            # tokenizer; a falsy result (e.g. None) means "no tokens".
            tokens = list(self.tokenize_string(record[field]) or [])

            if maxlength:
                # Measure the string that is actually emitted below, and stop
                # once no tokens are left so an unreachable limit cannot
                # raise IndexError.
                while tokens and len(f'|{"|".join(tokens)}|') >= maxlength:
                    del tokens[0]

            if ret_type == "list":
                # De-duplicate while keeping a deterministic order.
                final_tokens = list(dict.fromkeys(tokens))
            elif ret_type == "set":
                final_tokens = set(tokens)
            elif ret_type == "dict":
                final_tokens = OrderedDict(zip(tokens, tokens))
            else:
                # ret_type == "string", None, or anything unrecognised
                final_tokens = f'|{"|".join(tokens)}|'

            record[field_tokenized] = final_tokens
        return list_data
Esempio n. 3
0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#  License for the specific language governing permissions and limitations
#  under the License.
###########################################################################
from task_dedupe_jobs import TaskDedupeJobPosting
from helpers import docopt_ext, COMMON_OPTIONS
from util_log import logmsg

# Usage text handed to docopt_ext below; COMMON_OPTIONS is appended to it.
cli_usage = """
Usage:
  cmd_mark_duplicates.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>)
  cmd_mark_duplicates.py --version
  
Options:
  -o <file>, --output <file>    output JSON file with ID pairs of duplicate listings 
  -i <file>, --input <file>     input JSON data file with job postings
""" + COMMON_OPTIONS

if __name__ == '__main__':
    # The parsed command line becomes the task's keyword arguments.
    cli_args = docopt_ext(cli_usage, filename=__file__, version='0.1.1rc')

    try:
        dedupe_task = TaskDedupeJobPosting(**cli_args)
        dedupe_task.dedupe_jobs()
    except Exception as err:
        # Log the failure, then re-raise it to the interpreter.
        failure = f'Unable to deduplicate job postings: {err}'
        logmsg(failure)
        raise err
Esempio n. 4
0
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#  License for the specific language governing permissions and limitations
#  under the License.
###########################################################################
from util_log import logmsg
from helpers import docopt_ext, COMMON_OPTIONS
from task_tokenize_jobtitles import TaskAddTitleTokens

# Usage text handed to docopt_ext below; COMMON_OPTIONS is appended to it.
cli_usage = """
Usage:
  cmd_update_title_tokens.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>)
  cmd_update_title_tokens.py --version

Options:
""" + COMMON_OPTIONS

if __name__ == '__main__':
    arguments = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        toks = TaskAddTitleTokens(**arguments)
        toks.update_jobs_without_tokens()
    except Exception as ex:
        logmsg(f'Unable to update job title tokens: {ex}')
        # Bare raise re-raises the active exception with its traceback intact.
        raise

Esempio n. 5
0
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#  License for the specific language governing permissions and limitations
#  under the License.
###########################################################################
from task_find_and_match_places import FindPlacesFromDBLocationsTask
from helpers import docopt_ext, COMMON_OPTIONS
from util_log import logmsg

# Usage text handed to docopt_ext below; COMMON_OPTIONS is appended to it.
# -s/--server is optional in the usage pattern so that the default declared
# in the Options section can take effect when the flag is omitted.
cli_usage = """
Usage:
  cmd_set_geolocations.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>) [-s <server>]
  cmd_set_geolocations.py --version
  
Options:
  -s <server>, --server <server>            hostname for geocode api server [default: http://0.0.0.0:5000]
""" + COMMON_OPTIONS

if __name__ == '__main__':
    args = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        # The same parsed arguments configure the task and drive the update.
        matcher = FindPlacesFromDBLocationsTask(**args)
        matcher.update_all_locations(**args)
    except Exception as ex:
        logmsg(f'Unable to set geolocations: {ex}')
        # Bare raise re-raises the active exception with its traceback intact.
        raise
Esempio n. 6
0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#  License for the specific language governing permissions and limitations
#  under the License.
###########################################################################
from helpers import docopt_ext, COMMON_OPTIONS
from task_match_titles import TaskMatchJobsToKeywords
from util_log import logmsg

# Usage text handed to docopt_ext below; COMMON_OPTIONS is appended to it.
cli_usage = """
Usage:
  cmd_match_titles_to_keywords.py -i <file> -o <file> (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>)
  cmd_match_titles_to_keywords.py --version

Options:
  -o <file>, --output <file> output file with job match results 
  -i <file>, --input <file> input JSON data file with jobs and keywords
""" + COMMON_OPTIONS

if __name__ == '__main__':
    # The parsed command line becomes the task's keyword arguments.
    cli_args = docopt_ext(cli_usage, filename=__file__, version='0.1.1rc')

    try:
        keyword_task = TaskMatchJobsToKeywords(**cli_args)
        keyword_task.export_results()
    except Exception as err:
        # Log the failure, then re-raise it to the interpreter.
        failure = f'Unable to match title keywords: {err}'
        logmsg(failure)
        raise err
Esempio n. 7
0
#  License for the specific language governing permissions and limitations
#  under the License.
###########################################################################
from helpers import docopt_ext, COMMON_OPTIONS
from task_generate_broken_plugins_data import TaskGenerateBrokenPluginReportData

from util_log import logmsg
# Usage template: the two {} placeholders are filled with the script path
# before the text is handed to docopt_ext; COMMON_OPTIONS is appended.
cli_usage = """
Usage:
  {} (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>) -o <outputfile>
  {} --version

Options:
  -o <file>, --output <file> output file with job match results 
""" + COMMON_OPTIONS

if __name__ == '__main__':

    script_path = __file__
    usage_text = cli_usage.format(script_path, script_path)
    args = docopt_ext(usage_text, version='0.1.1rc', filename=script_path)

    try:
        # Guard clause: refuse to run without a usable "output" argument.
        if "output" not in args or not args["output"]:
            raise Exception("Missing output parameter.")
        reporter = TaskGenerateBrokenPluginReportData(**args)
    except Exception as err:
        # Log the failure, then re-raise it to the interpreter.
        logmsg(f'Unable to generate broken plugin report data:  {err}')
        raise err
Esempio n. 8
0
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#  License for the specific language governing permissions and limitations
#  under the License.
###########################################################################
from task_add_newposts_to_user import TaskAddNewMatchesToUser
from helpers import docopt_ext, COMMON_OPTIONS
from util_log import logmsg

# Usage text handed to docopt_ext below; COMMON_OPTIONS is appended to it.
cli_usage = """
Usage:
  cmd_add_newpostings_to_user.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>) --jobuserid <userid> --jobsite <jobsite>
  cmd_add_newpostings_to_user.py --version
  
Options:
  --jobuserid <userid>     user_id for user to add new matches
  -j <jobsite> --jobsite <jobsite>   jobsitekey for site to add listings from
""" + COMMON_OPTIONS

if __name__ == '__main__':
    # The parsed command line becomes the task's keyword arguments.
    cli_args = docopt_ext(cli_usage, filename=__file__, version='0.1.1rc')

    try:
        # Only the constructor is invoked here; no further method is called.
        matcher = TaskAddNewMatchesToUser(**cli_args)
    except Exception as err:
        # Log the failure, then re-raise it to the interpreter.
        failure = f'Unable to add job matches to user: {err}'
        logmsg(failure)
        raise err