def load_csv_data(csvfilename, fields=None):
    """Load the rows of a CSV file into a list.

    Args:
        csvfilename: path of the CSV file to read.
        fields: optional sequence of column names. When omitted,
            ``csv.DictReader`` takes the names from the first row of the file.

    Returns:
        A list with one entry per data row: the bare cell value when there is
        exactly one column, otherwise the row mapping produced by
        ``csv.DictReader``. Rows whose values simply repeat the column names
        (a header row read back as data) are skipped.
        ``None`` is returned when reading or parsing fails; the error is
        logged instead of raised.

    Raises:
        OSError: if the file cannot be opened (opening happens outside the
            best-effort error handling).
    """
    logmsg(f'Loading {csvfilename}...')

    data = []
    # newline='' leaves newline handling to the csv module, so quoted fields
    # containing line breaks are read back unchanged.
    with open(csvfilename, 'r', newline='') as csv_fp:
        try:
            csv_reader = csv.DictReader(csv_fp, fieldnames=fields, delimiter=",", quoting=csv.QUOTE_ALL)
            fields = csv_reader.fieldnames
            # Normalise to a list so the header comparison also works when
            # the caller passed the column names as a tuple.
            header = list(fields) if fields is not None else None
            single_column = header is not None and len(header) == 1

            for row in csv_reader:
                if list(row.values()) == header:
                    continue
                data.append(row[header[0]] if single_column else row)
        except Exception as err:
            # Deliberately best-effort: report the failure and signal it to
            # the caller with None rather than aborting the whole run.
            logmsg(f'Unable to load {csvfilename}: {err}')
            return None

    logdebug(f'Loaded {len(data)} rows from {csvfilename}')
    return data
def batch_tokenize_strings(self, list_data, field, field_tokenized="tokenized", ret_type="string", maxlength=None):
    """Tokenize one field of every record and store the result on the record.

    Args:
        list_data: mapping of key -> record (a mapping); modified in place.
        field: name of the record entry whose value is passed to
            ``self.tokenize_string``.
        field_tokenized: name of the record entry that receives the tokens.
        ret_type: shape of the stored value:
            ``"list"`` -> unique tokens in first-seen order,
            ``"set"`` -> set of tokens,
            ``"dict"`` -> ``OrderedDict`` mapping each token to itself,
            anything else -> the string ``'|tok1|tok2|...|'``.
        maxlength: when truthy, leading tokens are dropped until the
            ``'|tok1|tok2|...|'`` rendering is shorter than this many
            characters.

    Returns:
        The same ``list_data`` object that was passed in.
    """
    for key in list(list_data.keys()):
        if isinstance(key, str) and len(key) == 0:
            logmsg("String value for key was empty. Skipping...")
            continue

        # Work on a copy so trimming never mutates a list owned by the
        # tokenizer, and treat a None result as "no tokens".
        tokens = list(self.tokenize_string(list_data[key][field]) or [])

        if maxlength and tokens:
            # Length of '|a|b|c|' is the token characters plus one pipe per
            # token plus the leading pipe.
            rendered_len = sum(len(tok) for tok in tokens) + len(tokens) + 1
            drop = 0
            while drop < len(tokens) and rendered_len >= maxlength:
                rendered_len -= len(tokens[drop]) + 1
                drop += 1
            tokens = tokens[drop:]

        if ret_type == "list":
            # dict.fromkeys de-duplicates while keeping first-seen order.
            final_tokens = list(dict.fromkeys(tokens))
        elif ret_type == "set":
            final_tokens = set(tokens)
        elif ret_type == "dict":
            final_tokens = OrderedDict(zip(tokens, tokens))
        else:  # ret_type == "string" or anything unrecognised
            final_tokens = f'|{"|".join(tokens)}|'

        list_data[key][field_tokenized] = final_tokens

    return list_data
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
###########################################################################
from task_dedupe_jobs import TaskDedupeJobPosting
from helpers import docopt_ext, COMMON_OPTIONS
from util_log import logmsg

# Usage text handed to docopt_ext; COMMON_OPTIONS (from helpers) is appended
# to the option list.
# NOTE(review): -o/-i are described under Options but do not appear in the
# usage pattern - confirm whether they are still meant to be accepted.
cli_usage = """
Usage:
  cmd_mark_duplicates.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>)
  cmd_mark_duplicates.py --version

Options:
  -o <file>, --output <file>    output JSON file with ID pairs of duplicate listings
  -i <file>, --input <file>     input JSON data file with job postings
""" + COMMON_OPTIONS

if __name__ == '__main__':
    arguments = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        matcher = TaskDedupeJobPosting(**arguments)
        matcher.dedupe_jobs()
    except Exception as ex:
        logmsg(f'Unable to deduplicate job postings: {ex}')
        # Bare raise re-raises the active exception unchanged, so the process
        # still fails after the message has been logged.
        raise
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
###########################################################################
from util_log import logmsg
from helpers import docopt_ext, COMMON_OPTIONS
from task_tokenize_jobtitles import TaskAddTitleTokens

# Usage text handed to docopt_ext; this command declares no options of its
# own, so the Options section consists solely of helpers.COMMON_OPTIONS.
cli_usage = """
Usage:
  cmd_update_title_tokens.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>)
  cmd_update_title_tokens.py --version

Options:
""" + COMMON_OPTIONS

if __name__ == '__main__':
    arguments = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        toks = TaskAddTitleTokens(**arguments)
        toks.update_jobs_without_tokens()
    except Exception as ex:
        logmsg(f'Unable to update job title tokens: {ex}')
        # Bare raise re-raises the active exception unchanged, so the process
        # still fails after the message has been logged.
        raise
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
###########################################################################
from task_find_and_match_places import FindPlacesFromDBLocationsTask
from helpers import docopt_ext, COMMON_OPTIONS
from util_log import logmsg

# Usage text handed to docopt_ext; COMMON_OPTIONS (from helpers) is appended
# to the option list.
# -s is bracketed (optional) in the usage pattern so that the default
# declared for it under Options can actually take effect; passing -s
# explicitly works exactly as before.
cli_usage = """
Usage:
  cmd_set_geolocations.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>) [-s <server>]
  cmd_set_geolocations.py --version

Options:
  -s <server>, --server <server>    hostname for geocode api server [default: http://0.0.0.0:5000]
""" + COMMON_OPTIONS

if __name__ == '__main__':
    args = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        matcher = FindPlacesFromDBLocationsTask(**args)
        # The same parsed arguments are passed to both the constructor and
        # the update call.
        matcher.update_all_locations(**args)
    except Exception as ex:
        logmsg(f'Unable to set geolocations: {ex}')
        # Bare raise re-raises the active exception unchanged, so the process
        # still fails after the message has been logged.
        raise
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
###########################################################################
from helpers import docopt_ext, COMMON_OPTIONS
from task_match_titles import TaskMatchJobsToKeywords
from util_log import logmsg

# Usage text handed to docopt_ext; COMMON_OPTIONS (from helpers) is appended
# to the option list. Both -i and -o are required by the usage pattern.
cli_usage = """
Usage:
  cmd_match_titles_to_keywords.py -i <file> -o <file> (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>)
  cmd_match_titles_to_keywords.py --version

Options:
  -o <file>, --output <file>    output file with job match results
  -i <file>, --input <file>     input JSON data file with jobs and keywords
""" + COMMON_OPTIONS

if __name__ == '__main__':
    args = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        matcher = TaskMatchJobsToKeywords(**args)
        matcher.export_results()
    except Exception as ex:
        logmsg(f'Unable to match title keywords: {ex}')
        # Bare raise re-raises the active exception unchanged, so the process
        # still fails after the message has been logged.
        raise
# License for the specific language governing permissions and limitations
# under the License.
###########################################################################
from helpers import docopt_ext, COMMON_OPTIONS
from task_generate_broken_plugins_data import TaskGenerateBrokenPluginReportData
from util_log import logmsg

# Usage text handed to docopt_ext; COMMON_OPTIONS (from helpers) is appended
# to the option list. The two {} placeholders are filled with this script's
# path (__file__) before parsing.
cli_usage = """
Usage:
  {} (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>) -o <outputfile>
  {} --version

Options:
  -o <file>, --output <file>    output file with job match results
""" + COMMON_OPTIONS

if __name__ == '__main__':
    args = docopt_ext(cli_usage.format(__file__, __file__), version='0.1.1rc', filename=__file__)

    try:
        # A missing or empty "output" entry is treated as a usage error.
        if not args.get("output"):
            raise ValueError("Missing output parameter.")

        # Only the constructor is invoked here; presumably it performs the
        # report generation itself - TODO confirm against the task class.
        reporter = TaskGenerateBrokenPluginReportData(**args)
    except Exception as ex:
        logmsg(f'Unable to generate broken plugin report data: {ex}')
        # Bare raise re-raises the active exception unchanged, so the process
        # still fails after the message has been logged.
        raise
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
###########################################################################
from task_add_newposts_to_user import TaskAddNewMatchesToUser
from helpers import docopt_ext, COMMON_OPTIONS
from util_log import logmsg

# Usage text handed to docopt_ext; COMMON_OPTIONS (from helpers) is appended
# to the option list. --jobuserid and --jobsite are both required by the
# usage pattern.
cli_usage = """
Usage:
  cmd_add_newpostings_to_user.py (-c <dbstring> | --dsn <dbstring> | --host <hostname> --port <portid> --database <dbstring> --user <userstring> --password <userpass>) --jobuserid <userid> --jobsite <jobsite>
  cmd_add_newpostings_to_user.py --version

Options:
  --jobuserid <userid>               user_id for user to add new matches
  -j <jobsite> --jobsite <jobsite>   jobsitekey for site to add listings from
""" + COMMON_OPTIONS

if __name__ == '__main__':
    args = docopt_ext(cli_usage, version='0.1.1rc', filename=__file__)

    try:
        # Only the constructor is invoked here; presumably it performs the
        # work itself - TODO confirm against the task class.
        TaskAddNewMatchesToUser(**args)
    except Exception as ex:
        logmsg(f'Unable to add job matches to user: {ex}')
        # Bare raise re-raises the active exception unchanged, so the process
        # still fails after the message has been logged.
        raise